diff --git a/CMakeLists.txt b/CMakeLists.txt index 66dcef0013..d6aa8f1b85 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) +option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) @@ -261,6 +262,12 @@ if (WITH_PROFILER) add_definitions(-DWITH_GPERFTOOLS) endif() +if (WITH_JEMALLOC) + find_package(JeMalloc REQUIRED) + include_directories(${JEMALLOC_INCLUDE_DIR}) + add_definitions(-DWITH_JEMALLOC) +endif() + include(generic) # simplify cmake module include(package) # set paddle packages include(ccache) # set ccache for compilation @@ -290,7 +297,7 @@ if(WITH_PSLIB) list(APPEND EXTERNAL_LIBS pslib_brpc) list(APPEND EXTERNAL_LIBS libmct) endif(WITH_PSLIB) - + if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/Dockerfile b/Dockerfile index 84e1edbee9..acfd091265 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip3 install -U wheel && \ - pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.6 install -U wheel && \ - pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.7 install -U wheel && \ - pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \ +RUN pip3 --no-cache-dir install -U wheel && \ + pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 --no-cache-dir install -U wheel && \ + pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 --no-cache-dir install -U wheel && \ + pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ easy_install -U pip && \ - pip install -U pip setuptools wheel && \ - pip install -U docopt PyYAML sphinx==1.5.6 && \ - pip install sphinx-rtd-theme==0.1.9 recommonmark - -RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 install opencv-python && \ - pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 install opencv-python && \ - pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 install opencv-python && \ - pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install opencv-python + pip --no-cache-dir install -U pip setuptools wheel && \ + pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip --no-cache-dir install sphinx-rtd-theme==0.1.9 
recommonmark + +RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 --no-cache-dir install opencv-python && \ + pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 --no-cache-dir install opencv-python && \ + pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 --no-cache-dir install opencv-python && \ + pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip --no-cache-dir install opencv-python #For docstring checker -RUN pip3 install pylint pytest astroid isort -RUN pip3.6 install pylint pytest astroid isort -RUN pip3.7 install pylint pytest astroid isort -RUN pip install pylint pytest astroid isort LinkChecker +RUN pip3 --no-cache-dir install pylint pytest astroid isort +RUN pip3.6 --no-cache-dir install pylint pytest astroid isort +RUN pip3.7 --no-cache-dir install pylint pytest astroid isort +RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ -RUN pip3 install -r /root/requirements.txt -RUN pip3.6 install -r /root/requirements.txt -RUN pip3.7 install -r /root/requirements.txt -RUN pip install -r /root/requirements.txt +RUN pip3 --no-cache-dir install -r /root/requirements.txt +RUN pip3.6 --no-cache-dir install -r /root/requirements.txt +RUN pip3.7 --no-cache-dir install -r /root/requirements.txt +RUN pip --no-cache-dir install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -RUN apt-get install -y libssl-dev libffi-dev -RUN pip3 install certifi urllib3[secure] -RUN pip3.6 install certifi urllib3[secure] -RUN pip3.7 install certifi urllib3[secure] -RUN pip install certifi urllib3[secure] +RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y +RUN pip3 --no-cache-dir install certifi urllib3[secure] +RUN pip3.6 --no-cache-dir install certifi urllib3[secure] +RUN pip3.7 --no-cache-dir install certifi urllib3[secure] +RUN pip --no-cache-dir install certifi urllib3[secure] # Install woboq_codebrowser to /woboq @@ -149,6 +149,14 @@ RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ -DCMAKE_BUILD_TYPE=Release . \ make) +# ar mishandles 4GB files +# https://sourceware.org/bugzilla/show_bug.cgi?id=14625 +# remove them when apt-get support 2.27 and higher version +RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/2.27-9ubuntu1/binutils_2.27.orig.tar.gz && \ + tar -xzf binutils_2.27.orig.tar.gz && \ + cd binutils-2.27 && \ + ./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz + # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd RUN echo 'root:root' | chpasswd diff --git a/cmake/FindJeMalloc.cmake b/cmake/FindJeMalloc.cmake new file mode 100644 index 0000000000..7911f77c4c --- /dev/null +++ b/cmake/FindJeMalloc.cmake @@ -0,0 +1,21 @@ +# - Find JeMalloc library +# Find the native JeMalloc includes and library +# +# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc. +# JEMALLOC_LIBRARIES - List of libraries when using jemalloc. 
+# JEMALLOC_FOUND - True if jemalloc found. + +find_path(JEMALLOC_INCLUDE_DIR + NAMES jemalloc/jemalloc.h + HINTS ${JEMALLOC_ROOT_DIR}/include) + +find_library(JEMALLOC_LIBRARIES + NAMES jemalloc + HINTS ${JEMALLOC_ROOT_DIR}/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR) + +mark_as_advanced( + JEMALLOC_LIBRARIES + JEMALLOC_INCLUDE_DIR) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 4ee2fdcf2d..e3d856fb30 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -134,6 +134,7 @@ if(WITH_GPU) message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF") set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE) endif() + add_definitions(-DWITH_ANAKIN) endif() if(WITH_ANAKIN) # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 414e92eb27..10ecdf0ea8 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -2,7 +2,7 @@ if(NOT WITH_GPU) return() endif() -set(paddle_known_gpu_archs "30 35 50 52 60 61 70") +set(paddle_known_gpu_archs "30 35 50 52 60 61 70 75") set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs8 "30 35 50 52 60 61") @@ -59,7 +59,7 @@ endfunction() # select_nvcc_arch_flags(out_variable) function(select_nvcc_arch_flags out_variable) # List of arch names - set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual") + set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") set(archs_name_default "All") if(NOT CMAKE_CROSSCOMPILING) list(APPEND archs_names "Auto") @@ -93,6 +93,8 @@ function(select_nvcc_arch_flags out_variable) set(cuda_arch_bin "60 61") elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") set(cuda_arch_bin "70") + elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") + set(cuda_arch_bin "75") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(cuda_arch_bin ${paddle_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") @@ -139,10 +141,12 @@ endfunction() message(STATUS "CUDA detected: " ${CUDA_VERSION}) if (${CUDA_VERSION} LESS 7.0) set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) + add_definitions("-DPADDLE_CUDA_BINVER=\"60\"") elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"70\"") elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") @@ -150,6 +154,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x # CUDA 8 may complain that sm_20 is no longer supported. Suppress the # warning for now. 
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") + add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") endif() include_directories(${CUDA_INCLUDE_DIRS}) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index fb899e3d7c..fff1980637 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -89,6 +89,7 @@ if(CUDNN_FOUND) if(NOT CUDNN_MAJOR_VERSION) set(CUDNN_VERSION "???") else() + add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"") math(EXPR CUDNN_VERSION "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index c94849cf4b..f06728de91 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -32,4 +32,4 @@ endif() add_dependencies(cub extern_cub) -LIST(APPEND externl_project_dependencies cub) +LIST(APPEND external_project_dependencies cub) diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 94d8fcc668..4587475d79 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -28,4 +28,4 @@ endif() add_dependencies(dlpack extern_dlpack) -LIST(APPEND externl_project_dependencies dlpack) +LIST(APPEND external_project_dependencies dlpack) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index c29375cd05..a9b99e9ab8 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -106,10 +106,10 @@ else(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} - DEPENDS mkldnn) + DEPENDS mkldnn shared_mkldnn) endif(WIN32) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) - +ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn) IF(WITH_C_API) INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) ENDIF() diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index d49839a89d..96127e78d6 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,14 +16,6 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) -IF(APPLE) - MESSAGE(WARNING - "Mac is not supported with MKLML in Paddle yet." 
- "Force WITH_MKLML=OFF") - SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE) - return() -ENDIF() - INCLUDE(ExternalProject) SET(MKLML_DST_DIR "mklml") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") @@ -47,10 +39,13 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL)) MESSAGE(STATUS "use pre defined download url") if(WIN32) - SET(MKLML_VER "mklml_win_2019.0.20180710" CACHE STRING "" FORCE) + SET(MKLML_VER "mklml_win_2019.0.1.20180928" CACHE STRING "" FORCE) SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) + elseif(APPLE) + SET(MKLML_VER "mklml_mac_2019.0.1.20180928" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) else() - SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE) + SET(MKLML_VER "mklml_lnx_2019.0.1.20180928" CACHE STRING "" FORCE) SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) ENDIF() endif() diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index e66459fa3a..799d9c309f 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -37,13 +37,12 @@ INCLUDE(GNUInstallDirs) INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") -SET(NGRAPH_VERSION "0.9") -SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0") +SET(NGRAPH_GIT_TAG "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) -SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) +SET(NGRAPH_SHARED_LIB_NAME libngraph.so) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c6fe2e970d..4e31392b98 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -115,6 +115,10 @@ function(common_link TARGET_NAME) if (WITH_PROFILER) target_link_libraries(${TARGET_NAME} gperftools::profiler) endif() + + if (WITH_JEMALLOC) + target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES}) + endif() endfunction() @@ -228,7 +232,7 @@ function(merge_static_libs TARGET_NAME) # Get the file names of the libraries to be merged set(libfiles ${libfiles} $) endforeach() - # msvc will put libarary in directory of "/Release/xxxlib" by default + # msvc will put libarary in directory of "/Release/xxxlib" by default # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}" diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 48279bc809..3e11d332ff 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -136,7 +136,7 @@ if (WITH_MKLDNN) copy(mkldnn_lib SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} DSTS ${dst_dir} ${dst_dir}/lib - DEPS mkldnn + DEPS mkldnn_shared_lib ) endif () diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 70d159b4f3..59c40a0e5d 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -110,7 +110,7 @@ function(op_library TARGET) # Define operators that don't need pybind here. 
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" -"fusion_transpose_flatten_concat_op") +"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 86096d4fea..566dc75fda 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,46 +57,43 @@ int main() return 0; }" SSE3_FOUND) -# disable AVX by default on windows -if(NOT WIN32) - # Check AVX - set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) - set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; - }" AVX_FOUND) +# Check AVX +set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) +set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; +}" AVX_FOUND) - # Check AVX 2 - set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) - set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; - }" AVX2_FOUND) +# Check AVX 2 +set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) +set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; +}" AVX2_FOUND) - # Check AVX512F - set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) - set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; - }" AVX512F_FOUND) -endif(NOT WIN32) +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; +}" AVX512F_FOUND) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py index 8d95dc0591..500f64bed9 100644 --- a/paddle/contrib/float16/float16_transpiler.py +++ b/paddle/contrib/float16/float16_transpiler.py @@ -60,7 +60,7 @@ class Float16Transpiler: raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() - if not isinstance(scope, core.Scope): + if not isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") self.scope = scope diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b6974c6af2..9872631553 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -464,11 +464,7 @@ 
paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) -paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None -paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None -paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable -paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope -paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 867970717b..a167511160 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -7,27 +7,17 @@ function(windows_symbolic TARGET) cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) foreach(src ${windows_symbolic_SRCS}) - get_filename_component(src ${src} NAME_WE) - if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) - message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") - endif() - -#only copy the xx.cu to.xx.cu when the content are modified - set(copy_flag 1) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR) - if (SOURCE_STR STREQUAL TARGET_STR) - set(copy_flag 0) - endif() - endif() - if (copy_flag) - add_custom_command(OUTPUT .${src}.cu - COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" - COMMENT "create hidden file of ${src}.cu") - endif(copy_flag) - add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + get_filename_component(src ${src} NAME_WE) + if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu) + message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") + endif() + + file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) + + add_custom_command(OUTPUT ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() endfunction() @@ -37,9 +27,10 @@ add_subdirectory(details) 
proto_library(framework_proto SRCS framework.proto) proto_library(async_executor_proto SRCS data_feed.proto) -cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) +cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) +cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) if(WITH_GPU) @@ -78,17 +69,23 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) -cc_test(variable_test SRCS variable_test.cc) - cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) +if (WITH_GPU) + target_link_libraries(var_type_traits dynload_cuda) +endif() +cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) + +cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits) +cc_library(scope_pool SRCS scope_pool.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope) +cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) nv_test(data_device_transform_test SRCS data_device_transform_test.cu - DEPS operator op_registry device_context math_function) + DEPS operator op_registry device_context math_function scope) if(WITH_GPU) if (WIN32) @@ -133,11 +130,9 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) if(WITH_NGRAPH) - if(NOT WIN32) - cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) - cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler ngraph) - endif(NOT WIN32) + cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) + cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog + shape_inference data_transform lod_tensor profiler) endif(WITH_NGRAPH) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) @@ -179,11 +174,7 @@ if(WITH_DISTRIBUTE) else() if(WITH_NGRAPH) - if(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper) - else(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) - endif(NOT WIN32) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) else(WITH_NGRAPH) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) endif(WITH_NGRAPH) 
@@ -193,14 +184,14 @@ endif() target_link_libraries(executor garbage_collector) cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor variable_helper) if(WITH_PSLIB) - cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib) + cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib timer) else() - cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) + cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper timer) endif(WITH_PSLIB) diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h index be9efcd749..b530829868 100644 --- a/paddle/fluid/framework/array.h +++ b/paddle/fluid/framework/array.h @@ -15,34 +15,123 @@ #pragma once #include -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/framework/unroll_array_ops.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace framework { + template class Array { - static_assert(N > 0, "The size of array must be larger than 0"); - public: - HOSTDEVICE Array() {} + static constexpr size_t kSize = N; + + HOSTDEVICE inline Array() {} - HOSTDEVICE explicit Array(const T &val) { - for (size_t i = 0; i < N; ++i) data_[i] = val; + template + HOSTDEVICE inline explicit Array(const T &val, Args... args) { + static_assert(N == sizeof...(Args) + 1, "Invalid argument"); + UnrollVarArgsAssign::Run(data_, val, args...); } - HOSTDEVICE const T *Get() const { return data_; } + HOSTDEVICE inline void Fill(const T &val) { + UnrollFillConstant::Run(data_, val); + } - HOSTDEVICE T *GetMutable() { return data_; } + HOSTDEVICE inline const T *Get() const { return data_; } - HOSTDEVICE T &operator[](size_t index) { return data_[index]; } + HOSTDEVICE inline T *GetMutable() { return data_; } - HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; } + HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); } + + // Writing "return data_[i]" would cause compilation warning/error: + // "array subscript is above array bound" in Python 35 CI. + // It seems that it is a false warning of GCC if we do not check the bounds + // of array index. But for better performance, we do not check in operator[] + // like what is in STL. 
If users want to check the bounds, use at() instead + HOSTDEVICE inline const T &operator[](size_t i) const { + return *advance(data_, i); + } + + HOSTDEVICE inline T &at(size_t i) { +#ifndef __CUDA_ARCH__ + PADDLE_ENFORCE_LT(i, N, "Array index out of bounds"); +#endif + return (*this)[i]; + } + + HOSTDEVICE inline const T &at(size_t i) const { +#ifndef __CUDA_ARCH__ + PADDLE_ENFORCE_LT(i, N, "Array index out of bounds"); +#endif + return (*this)[i]; + } HOSTDEVICE constexpr size_t size() const { return N; } + HOSTDEVICE inline bool operator==(const Array &other) const { + return UnrollCompare::Run(data_, other.data_); + } + + HOSTDEVICE inline bool operator!=(const Array &other) const { + return !(*this == other); + } + private: + template + HOSTDEVICE static inline U *advance(U *ptr, size_t i) { + return ptr + i; + } + T data_[N]; }; +template +class Array { + public: + static constexpr size_t kSize = 0; + + HOSTDEVICE inline Array() {} + + HOSTDEVICE inline void Fill(const T &val) {} + + HOSTDEVICE inline constexpr T *Get() const { return nullptr; } + + // Add constexpr to GetMutable() cause warning in MAC + HOSTDEVICE inline T *GetMutable() { return nullptr; } + + HOSTDEVICE inline T &operator[](size_t) { +#ifdef __CUDA_ARCH__ + static T obj(); + return obj; +#else + PADDLE_THROW("Array has no element"); +#endif + } + + HOSTDEVICE inline const T &operator[](size_t) const { +#ifdef __CUDA_ARCH__ + static const T obj(); + return obj; +#else + PADDLE_THROW("Array has no element"); +#endif + } + + HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; } + + HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; } + + HOSTDEVICE constexpr size_t size() const { return 0; } + + HOSTDEVICE constexpr bool operator==(const Array &other) const { + return true; + } + + HOSTDEVICE constexpr bool operator!=(const Array &other) const { + return false; + } +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index ee3c5e01f8..1d9678a1ba 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -304,8 +304,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, // start executing ops in multiple threads for (int thidx = 0; thidx < actual_thread_num; ++thidx) { - threads.push_back( - std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); + if (debug) { + threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer, + workers[thidx].get())); + } else { + threads.push_back( + std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); + } } for (auto& th : threads) { diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index c9ec5e7a7b..96a2f9250f 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index 05e423b8a5..e7a6df57e5 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/fluid/framework/ddim.cc @@ -18,312 +18,159 @@ limitations under the License. */ namespace paddle { namespace framework { -/// @cond HIDDEN - -template -Dim make_dim(const int64_t* d) { - return Dim(*d, make_dim(d + 1)); -} - -template <> -Dim<0> make_dim<0>(const int64_t* d) { - return Dim<0>(*d); -} - -void make_ddim(DDim& ddim, const int64_t* dims, int n) { - switch (n) { - case 0: - ddim = make_dim<0>(dims); - break; - case 1: - ddim = make_dim<1>(dims); - break; - case 2: - ddim = make_dim<2>(dims); - break; - case 3: - ddim = make_dim<3>(dims); - break; - case 4: - ddim = make_dim<4>(dims); - break; - case 5: - ddim = make_dim<5>(dims); - break; - case 6: - ddim = make_dim<6>(dims); - break; - case 7: - ddim = make_dim<7>(dims); - break; - case 8: - ddim = make_dim<8>(dims); - break; - case 9: - ddim = make_dim<9>(dims); - break; - default: - PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions."); - } -} - -/// @endcond - DDim make_ddim(std::initializer_list dims) { - DDim result(make_dim(0)); - make_ddim(result, dims.begin(), dims.size()); - return result; + return DDim(dims.begin(), dims.size()); } DDim make_ddim(const std::vector& dims) { - DDim result(make_dim(0)); - make_ddim(result, &dims[0], dims.size()); - return result; + return DDim(dims.data(), dims.size()); } DDim make_ddim(const std::vector& dims) { - std::vector res(dims.size()); - std::transform(dims.begin(), dims.end(), res.begin(), - [](int d) { return static_cast(d); }); - return make_ddim(res); + return DDim(dims.data(), dims.size()); } -/// @cond HIDDEN -// XXX For some reason, putting this in an anonymous namespace causes errors -class DynamicMutableIndexer : public boost::static_visitor { - public: - explicit DynamicMutableIndexer(int idx) : idx_(idx) {} +struct DDimEqualityVisitor { + explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {} template - int64_t& operator()(Dim& dim) const { - return dim[idx_]; + inline bool operator()(const Dim& self) const { + return UnrollCompare::Run(self.Get(), d_); } - private: - int idx_; + const int64_t* d_; }; -class DynamicConstIndexer : public boost::static_visitor { - public: - explicit DynamicConstIndexer(int idx) : idx_(idx) {} - - template - int64_t operator()(const Dim& dim) const { - return dim[idx_]; - } - - private: - int idx_; -}; - -/// @endcond - -int64_t& DDim::operator[](int idx) { - return boost::apply_visitor(DynamicMutableIndexer(idx), var); +bool DDim::operator==(const DDim& d) const { + return size() == d.size() && + this->apply_visitor(DDimEqualityVisitor(d.Get())); } -int64_t DDim::operator[](int idx) const { - return boost::apply_visitor(DynamicConstIndexer(idx), var); -} +bool DDim::operator!=(const DDim& d) const { return !(*this == d); } -int DDim::size() const { return arity(*this); } +struct DDimPlusVisitor { + explicit DDimPlusVisitor(const int64_t* d1, const int64_t* d2) + : d1_(d1), d2_(d2) {} -bool DDim::operator==(DDim d) const { - if (var.which() != d.getVar().which()) { - return false; - } else { - 
std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); - - for (unsigned int i = 0; i < v1.size(); i++) { - if (v1[i] != v2[i]) { - return false; - } - } - - return true; + template + inline void operator()(Dim& self) const { + UnrollAdd::Run(d1_, d2_, self.GetMutable()); } -} - -bool DDim::operator!=(DDim d) const { return !(*this == d); } - -DDim DDim::operator+(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); - - std::vector v3; - assert(v1.size() == v2.size()); - - for (unsigned int i = 0; i < v1.size(); i++) { - v3.push_back(v1[i] + v2[i]); - } + const int64_t* d1_; + const int64_t* d2_; +}; - return make_ddim(v3); +DDim DDim::operator+(const DDim& d) const { + PADDLE_ENFORCE(size() == d.size()); + DDim ret; + ret.rank_ = rank_; + ret.apply_visitor(DDimPlusVisitor(Get(), d.Get())); + return ret; } -DDim DDim::operator*(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); +struct DDimMulVisitor { + explicit DDimMulVisitor(const int64_t* d1, const int64_t* d2) + : d1_(d1), d2_(d2) {} - std::vector v3; - - assert(v1.size() == v2.size()); - - for (unsigned int i = 0; i < v1.size(); i++) { - v3.push_back(v1[i] * v2[i]); + template + inline void operator()(Dim& self) const { + UnrollMul::Run(d1_, d2_, self.GetMutable()); } - return make_ddim(v3); + const int64_t* d1_; + const int64_t* d2_; +}; + +DDim DDim::operator*(const DDim& d) const { + PADDLE_ENFORCE(size() == d.size()); + DDim ret; + ret.rank_ = rank_; + ret.apply_visitor(DDimMulVisitor(Get(), d.Get())); + return ret; } int64_t get(const DDim& ddim, int idx) { return ddim[idx]; } -void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } - -/// @cond HIDDEN -struct VectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; - - explicit VectorizeVisitor(std::vector& v) : vector(v) {} - - template - void operator()(const T& t) { - vector.push_back(t.head); - this->operator()(t.tail); - } - - void operator()(const Dim<0>& t) {} -}; -/// @endcond +void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } // NOLINT std::vector vectorize(const DDim& ddim) { - std::vector result; - VectorizeVisitor visitor(result); - boost::apply_visitor(visitor, ddim); + std::vector result(DDim::kMaxRank); + dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); + result.resize(ddim.size()); return result; } // NOTE: framework::vectorize converts to type int64_t // which does not fit cudnn inputs. 
std::vector vectorize2int(const DDim& ddim) { - std::vector temp = vectorize(ddim); - std::vector result(temp.begin(), temp.end()); + std::vector result(DDim::kMaxRank); + dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); + result.resize(ddim.size()); return result; } -struct ProductVisitor : public boost::static_visitor { +struct ProductVisitor { template - int64_t operator()(const Dim& dim) { + inline int64_t operator()(const Dim& dim) { return product(dim); } }; int64_t product(const DDim& ddim) { - ProductVisitor visitor; - return boost::apply_visitor(visitor, ddim); + return ddim.apply_visitor(ProductVisitor()); } -struct SliceVectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; - int begin; - int end; - - SliceVectorizeVisitor(std::vector& v, int b, int e) - : vector(v), begin(b), end(e) { - PADDLE_ENFORCE(begin < end, - "Begin index must be less than end index in ddim slice."); - PADDLE_ENFORCE(begin >= 0, - "Begin index can't be less than zero in ddim slice."); - } - - template - void operator()(const Dim& dim) { - if (begin == 0) { - vector.push_back(dim.head); - } else { - --begin; - } - --end; - if (end > 0) { - this->operator()(dim.tail); - } - } - - void operator()(const Dim<0>& dim) { - PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound."); - } -}; - DDim slice_ddim(const DDim& dim, int begin, int end) { - std::vector vec; - vec.reserve(end - begin); - SliceVectorizeVisitor visitor(vec, begin, end); - boost::apply_visitor(visitor, dim); - return make_ddim(vec); + PADDLE_ENFORCE(begin >= 0 && end <= dim.size(), + "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.", + begin, end, dim.size()); + // Constructor of DDim would check whether end - begin is valid + return DDim(dim.Get() + begin, end - begin); } -/// \cond HIDDEN - -struct ArityVisitor : boost::static_visitor { - template - int operator()(Dim) const { - return D; - } -}; - -/// \endcond - -int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } +int arity(const DDim& d) { return d.size(); } -/// \cond HIDDEN - -struct DDimPrinter : boost::static_visitor { +struct DDimPrinter { std::ostream& os; explicit DDimPrinter(std::ostream& os_) : os(os_) {} - template - void operator()(const T& t) { + template + void operator()(const Dim& t) { os << t; } }; -/// \endcond - std::ostream& operator<<(std::ostream& os, const DDim& ddim) { - DDimPrinter printer(os); - boost::apply_visitor(printer, ddim); + ddim.apply_visitor(DDimPrinter(os)); return os; } -DDim::DDim(std::initializer_list init_list) { - *this = make_ddim(init_list); -} - DDim flatten_to_2d(const DDim& src, int num_col_dims) { - int rank = src.size(); - return make_ddim({product(slice_ddim(src, 0, num_col_dims)), - product(slice_ddim(src, num_col_dims, rank))}); + return DDim({product(slice_ddim(src, 0, num_col_dims)), + product(slice_ddim(src, num_col_dims, src.size()))}); } -DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } +DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); } DDim stride(const DDim& ddim) { - std::vector strides(ddim.size()); + DDim strides; + strides.rank_ = ddim.size(); strides[ddim.size() - 1] = 1; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i + 1]; } - return framework::make_ddim(strides); + return strides; } -DDim stride_numel(const framework::DDim& ddim) { - std::vector strides(ddim.size()); +DDim stride_numel(const DDim& ddim) { + DDim strides; + strides.rank_ = ddim.size(); 
strides[ddim.size() - 1] = ddim[ddim.size() - 1]; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i]; } - return framework::make_ddim(strides); + return strides; } } // namespace framework diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index f05b5ee3fa..31a41dab2a 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -18,62 +18,145 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/dim.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { +#define PADDLE_VISIT_DDIM_BASE(rank, callback) \ + case (rank): { \ + constexpr auto kRank = (rank); \ + return (callback); \ + } + +#define PADDLE_VISIT_DDIM(rank, callback) \ + switch (rank) { \ + PADDLE_VISIT_DDIM_BASE(0, callback); \ + PADDLE_VISIT_DDIM_BASE(1, callback); \ + PADDLE_VISIT_DDIM_BASE(2, callback); \ + PADDLE_VISIT_DDIM_BASE(3, callback); \ + PADDLE_VISIT_DDIM_BASE(4, callback); \ + PADDLE_VISIT_DDIM_BASE(5, callback); \ + PADDLE_VISIT_DDIM_BASE(6, callback); \ + PADDLE_VISIT_DDIM_BASE(7, callback); \ + PADDLE_VISIT_DDIM_BASE(8, callback); \ + PADDLE_VISIT_DDIM_BASE(9, callback); \ + default: \ + PADDLE_THROW("Invalid rank %d", rank); \ + } + +template +inline void dynamic_dim_assign(const T1* in, T2* out, int n) { + PADDLE_VISIT_DDIM(n, (static_dim_assign(in, out))); +} + /** * \brief A dynamically sized dimension. * * The number of dimensions must be between [1, 9]. */ -struct DDim { - typedef boost::variant, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, - Dim<7>, Dim<8>, Dim<9>> - DDimVar; - DDimVar var; +class DDim { + public: + constexpr static int kMaxRank = 9; + + DDim() : rank_(1) { dim_[0] = 0; } - DDim() : var(Dim<1>()) {} + DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); } + + DDim(const int* d, int n) : rank_(n) { + dynamic_dim_assign(d, dim_.GetMutable(), n); + } + + DDim(const int64_t* d, int n) : rank_(n) { + dynamic_dim_assign(d, dim_.GetMutable(), n); + } template - explicit DDim(const Dim& in) : var(in) {} + /*implicit*/ DDim(const Dim& in) : rank_(D) { // NOLINT + UnsafeCast() = in; + } + + /*implicit*/ DDim(std::initializer_list init_list) + : DDim(init_list.begin(), init_list.size()) {} - /*implicit*/ DDim(std::initializer_list init_list); + inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); } template - DDim& operator=(const Dim& in) { - var = in; + inline DDim& operator=(const Dim& dim) { + rank_ = D; + UnsafeCast() = dim; return *this; } - int64_t& operator[](int idx); - int64_t operator[](int idx) const; + inline int64_t& operator[](int idx) { return dim_[idx]; } + + inline int64_t operator[](int idx) const { return dim_[idx]; } + + inline int64_t& at(int idx) { + PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx); + return dim_[idx]; + } + + inline int64_t at(int idx) const { + PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx); + return dim_[idx]; + } template - typename Visitor::result_type apply_visitor(Visitor& visitor) { - return var.apply_visitor(visitor); + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor) { + PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); } template - typename Visitor::result_type apply_visitor(Visitor& visitor) const { - return var.apply_visitor(visitor); + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor) const { + PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); } - DDimVar getVar() { return var; } + 
bool operator==(const DDim& d) const; + + bool operator!=(const DDim& d) const; + + DDim operator+(const DDim& d) const; - bool operator==(DDim d) const; + DDim operator*(const DDim& d) const; - bool operator!=(DDim d) const; + inline const int64_t* Get() const { return dim_.Get(); } - DDim operator+(DDim d) const; + inline int64_t* GetMutable() { return dim_.GetMutable(); } - DDim operator*(DDim d) const; + inline int size() const { return rank_; } + + private: + template + inline Dim& UnsafeCast() { + static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); + auto* p = static_cast(&dim_); + return *reinterpret_cast*>(p); + } + + template + inline const Dim& UnsafeCast() const { + static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); + auto* p = static_cast(&dim_); + return *reinterpret_cast*>(p); + } - int size() const; + inline DDim& CopyFrom(const DDim& ddim) { + PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast())); + } + + friend DDim stride(const DDim& ddim); + friend DDim stride_numel(const DDim& ddim); + + private: + Dim dim_; + int rank_; }; +#undef PADDLE_VISIT_DDIM_BASE +#undef PADDLE_VISIT_DDIM + /** * \brief Make a DDim from std::vector * @@ -92,7 +175,7 @@ DDim make_ddim(const std::vector& dims); DDim make_ddim(std::initializer_list dims); int64_t get(const DDim& dim, int idx); -void set(DDim& dim, int idx, int val); +void set(DDim& dim, int idx, int val); // NOLINT std::vector vectorize(const DDim& ddim); std::vector vectorize2int(const DDim& ddim); @@ -129,12 +212,3 @@ DDim stride(const DDim& ddim); DDim stride_numel(const DDim& ddim); } // namespace framework } // namespace paddle - -namespace boost { - -template -T get(const paddle::framework::DDim& in) { - return boost::get(in.var); -} - -} // namespace boost diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 63a68ba3a5..179aa14528 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -77,6 +77,8 @@ cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUT cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) +cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) + cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 9eaff1f560..a24e3d3e48 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -19,6 +19,13 @@ #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/profiler.h" +// asynchronous nccl allreduce or synchronous issue: +// https://github.com/PaddlePaddle/Paddle/issues/15049 +DEFINE_bool( + sync_nccl_allreduce, false, + "If set true, will call `cudaStreamSynchronize(nccl_stream)`" + "after allreduce, this mode can get better performance in some scenarios."); + namespace paddle { namespace framework { namespace details { @@ -48,100 +55,104 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, void AllReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name(), 
dev_ctxes_.cbegin()->second); -// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, -// this is a distributed or inter-process call, find a better way. -#ifdef PADDLE_WITH_CUDA - if (NoDummyInputSize() == 1 && - local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) { -#else - if (NoDummyInputSize() == 1) { -#endif - return; // No need to all reduce when GPU count = 1; - } else { - // Wait input done - WaitInputVarGenerated(); - auto in_var_handles = DynamicCast(this->Inputs()); - auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), places_.size(), - "The NoDummyInputSize should be equal to the number of places."); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), out_var_handles.size(), - "The NoDummyInputSize and NoDummyOutputSize should be equal."); - - std::vector lod_tensors; - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto *s = local_scopes_[i]; - auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); - auto &lod_tensor = - local_scope.FindVar(in_var_handles[i]->name_)->Get(); - lod_tensors.emplace_back(&lod_tensor); - PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, - "The name of input and output should be equal."); - } + WaitInputVarGenerated(); + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + + std::vector lod_tensors; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto *s = local_scopes_[i]; + auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); + auto &lod_tensor = + local_scope.FindVar(in_var_handles[i]->name_)->Get(); + lod_tensors.emplace_back(&lod_tensor); + PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, + "The name of input and output should be equal."); + } - if (platform::is_gpu_place(lod_tensors[0]->place())) { + if (platform::is_gpu_place(lod_tensors[0]->place())) { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); - int dtype = -1; - size_t numel = 0; - std::vector> all_reduce_calls; - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &p = places_[i]; - auto &lod_tensor = *lod_tensors[i]; - void *buffer = const_cast(lod_tensor.data()); - - if (dtype == -1) { - dtype = platform::ToNCCLDataType(lod_tensor.type()); - } + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + int dtype = -1; + size_t numel = 0; + std::vector> all_reduce_calls; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = places_[i]; + auto &lod_tensor = *lod_tensors[i]; + void *buffer = const_cast(lod_tensor.data()); - if (numel == 0) { - numel = static_cast(lod_tensor.numel()); - } + if (dtype == -1) { + dtype = platform::ToNCCLDataType(lod_tensor.type()); + } - int dev_id = boost::get(p).device; - auto &nccl_ctx = nccl_ctxs_->at(dev_id); - auto stream = nccl_ctx.stream(); - auto comm = nccl_ctx.comm_; - all_reduce_calls.emplace_back([=] { - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( - buffer, buffer, numel, static_cast(dtype), - ncclSum, comm, stream)); - }); + if (numel == 0) { + numel = static_cast(lod_tensor.numel()); } - this->RunAndRecordEvent([&] { + + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); 
+ auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + buffer, buffer, numel, static_cast(dtype), ncclSum, + comm, stream)); + }); + } + + this->RunAndRecordEvent([&] { + if (all_reduce_calls.size() == 1UL) { + // Do not use NCCLGroup when manage NCCL by per thread per device + all_reduce_calls[0](); + } else { platform::NCCLGroupGuard guard; for (auto &call : all_reduce_calls) { call(); } - }); + } + }); + + if (FLAGS_sync_nccl_allreduce) { + for (auto &p : places_) { + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + cudaStreamSynchronize(stream); + } + } + #else - PADDLE_THROW("Not compiled with CUDA"); + PADDLE_THROW("Not compiled with CUDA"); #endif - } else { // Special handle CPU only Operator's gradient. Like CRF - auto &trg = *this->local_scopes_[0] - ->FindVar(kLocalExecScopeName) - ->Get() - ->FindVar(out_var_handles[0]->name_) - ->GetMutable(); - - // Reduce All Tensor to trg in CPU - ReduceLoDTensor func(lod_tensors, &trg); - VisitDataType(lod_tensors[0]->type(), func); - - for (size_t i = 1; i < local_scopes_.size(); ++i) { - auto &scope = - *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); - auto &p = places_[i]; - auto *var = scope.FindVar(out_var_handles[i]->name_); - auto *dev_ctx = dev_ctxes_.at(p); - - RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { - auto &tensor_gpu = *var->GetMutable(); - auto &tensor_cpu = trg; - TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu); - }); - } + } else { // Special handle CPU only Operator's gradient. Like CRF + auto &trg = *this->local_scopes_[0] + ->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(out_var_handles[0]->name_) + ->GetMutable(); + + // Reduce All Tensor to trg in CPU + ReduceLoDTensor func(lod_tensors, &trg); + VisitDataType(lod_tensors[0]->type(), func); + + for (size_t i = 1; i < local_scopes_.size(); ++i) { + auto &scope = + *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); + auto &p = places_[i]; + auto *var = scope.FindVar(out_var_handles[i]->name_); + auto *dev_ctx = dev_ctxes_.at(p); + + RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { + auto &tensor_gpu = *var->GetMutable(); + auto &tensor_cpu = trg; + TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu); + }); } } } diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 389366a8a9..a68b69e026 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/details/memory_reuse_types.h" -#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" +#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h" @@ -31,7 +31,11 @@ namespace framework { namespace details { static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { - return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1); + // Should fix the allreduce op order if scheduling + // them in multiple threads or processes to avoid hang. 
+ return (!strategy.enable_sequential_execution_ && + strategy.num_trainers_ > 1) || + strategy.enable_parallel_graph_; } class ParallelExecutorPassBuilder : public ir::PassBuilder { @@ -67,7 +71,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { context->endpoints_ = strategy_.trainers_endpoints_; context->trainer_id_ = strategy_.trainer_id_; PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0"); - if (strategy_.trainer_id_ > 0) { + if (strategy_.trainer_id_ > 0 && strategy_.trainers_endpoints_.size() > 0) { PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) < strategy_.trainers_endpoints_.size(), "trainer_id_ < endpoints_ size"); @@ -82,12 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (strategy.memory_optimize_) { auto analysis_var_pass = AppendPass("analysis_var_pass"); } - // Convert graph to run on multi-devices. - auto multi_devices_pass = AppendPass("multi_devices_pass"); - multi_devices_pass->SetNotOwned("strategy", - &strategy_); - multi_devices_pass->Set("num_trainers", - new int(strategy_.num_trainers_)); + + AppendMultiDevPass(strategy); // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { @@ -113,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } } + // Convert graph to run on multi-devices. + void AppendMultiDevPass(const BuildStrategy &strategy) { + ir::Pass *multi_devices_pass; + if (strategy_.is_distribution_) { + multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); + } else { + if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + multi_devices_pass = + AppendPass("allreduce_mode_multi_devices_pass").get(); + } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); + } else { + PADDLE_THROW("Unknown reduce strategy."); + } + } + multi_devices_pass->SetNotOwned("strategy", + &strategy_); + } + private: BuildStrategy strategy_; }; @@ -129,9 +148,14 @@ std::shared_ptr BuildStrategy::CreatePassesFromStrategy( return pass_builder_; } +bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { + return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0; +} + std::unique_ptr BuildStrategy::Apply( const ProgramDesc &main_program, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, + const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { #else @@ -142,19 +166,23 @@ std::unique_ptr BuildStrategy::Apply( std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { - if (pass->Type() == "multi_devices_pass") { - pass->Erase("places"); - pass->SetNotOwned>("places", &places); - pass->Erase("loss_var_name"); - pass->SetNotOwned("loss_var_name", &loss_var_name); - pass->Erase("local_scopes"); - pass->SetNotOwned>("local_scopes", + if (IsMultiDevPass(pass->Type())) { + pass->Erase(kPlaces); + pass->SetNotOwned>(kPlaces, &places); + pass->Erase(kLossVarName); + pass->SetNotOwned(kLossVarName, &loss_var_name); + pass->Erase(kLocalScopes); + pass->SetNotOwned>(kLocalScopes, &local_scopes); + pass->Erase(kNRanks); + pass->Set(kNRanks, new size_t(nranks)); + #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? 
nccl_ctxs : nullptr; pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif + } else if (pass->Type() == "analysis_var_pass") { const std::vector *all_op_descs = new std::vector(main_program.Block(0).AllOps()); @@ -195,7 +223,9 @@ std::unique_ptr BuildStrategy::Apply( USE_PASS(fuse_elewise_add_act_pass); USE_PASS(graph_viz_pass); USE_PASS(multi_batch_merge_pass); -USE_PASS(multi_devices_pass); +USE_PASS(reduce_mode_multi_devices_pass); +USE_PASS(allreduce_mode_multi_devices_pass); +USE_PASS(dist_multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); USE_PASS(analysis_var_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 11db184cb4..15c2e01b61 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -74,8 +74,6 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; - bool enable_data_balance_{false}; - bool memory_optimize_{false}; bool memory_early_delete_{false}; @@ -84,6 +82,10 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; + // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, + // num_trainers is 1, so the current fields of build_strategy doesn't tell if + // it's distributed model. + bool is_distribution_{false}; int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; @@ -104,12 +106,15 @@ struct BuildStrategy { bool IsFinalized() const { return is_finalized_; } + bool IsMultiDevPass(const std::string &pass_name) const; + // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. std::unique_ptr Apply(const ProgramDesc &main_program, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, + const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; @@ -117,6 +122,13 @@ struct BuildStrategy { const bool use_cuda) const; #endif + // If set true, ParallelExecutor would build the main_program into multiple + // graphs, + // each of the graphs would run with one device. This approach can achieve + // better performance + // on some scenarios. 
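The comment above introduces enable_parallel_graph_: when it is on, ParallelExecutor builds one graph per place and hands them to the ParallelSSAGraphExecutor added later in this patch. As a rough, hedged sketch of how a caller might drive the revised interface (the field and parameter names come from this patch; program, places, loss_var_name, local_scopes, num_trainers, and nccl_ctxs are assumed to be set up by the surrounding ParallelExecutor code, which is where the real call site lives):

    using paddle::framework::details::BuildStrategy;

    BuildStrategy strategy;
    strategy.reduce_ = BuildStrategy::ReduceStrategy::kAllReduce;
    strategy.enable_parallel_graph_ = true;  // build one graph per device

    // nranks now reaches the multi-devices passes through Apply().
    size_t nranks = places.size() * static_cast<size_t>(num_trainers);
    auto graph = strategy.Apply(program, places, loss_var_name, local_scopes,
                                nranks, /*use_cuda=*/true, nccl_ctxs);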
+ mutable bool enable_parallel_graph_ = false; + private: mutable bool is_finalized_ = false; mutable std::shared_ptr pass_builder_; diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index abacb11e3b..03fbfd7f24 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() { } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 15c496130c..37b07e5736 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,7 +25,7 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; - size_t num_iteration_per_drop_scope_{100}; + size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; }; diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 949510e037..872bc5d654 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ClearFetchOp(graph_.get(), &fetch_ops); return fetches; } + void FastThreadedSSAGraphExecutor::RunOpAsync( std::unordered_map> *op_deps, OpHandleBase *op, diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc index c8ea188046..a4bb1e26d9 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -21,68 +21,78 @@ namespace paddle { namespace framework { namespace details { -bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { - std::unordered_map pending_ops; - std::unordered_set pending_vars; - std::unordered_set ready_vars; - std::unordered_set ready_ops; +class SSAGraghBuilderWithChecker : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; + } - auto insert_pending_var = [&](VarHandleBase *var) { - pending_vars.insert(var); - if (var->GeneratedOp() == nullptr) { - ready_vars.emplace(var); - } - }; + bool IsValidGraph(const ir::Graph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; - for (auto &var_map : graph->Get(kGraphVars)) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - insert_pending_var(version_pair); + auto insert_pending_var = [&](VarHandleBase *var) { + pending_vars.insert(var); + if (var->GeneratedOp() == nullptr) { + ready_vars.emplace(var); } - } - } + }; - for (auto &var : graph->Get(kGraphDepVars)) { - insert_pending_var(var); - } + for (auto &var_map : graph->Get(kGraphVars)) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair); + } + } + } - for (OpHandleBase *op : ir::FilterByNodeWrapper(*graph)) { - if (op->Inputs().empty()) { - ready_ops.insert(op); - } else { - pending_ops.insert({op, op->NoDupInputSize()}); + for (auto &var : graph->Get(kGraphDepVars)) { + insert_pending_var(var); } - } - auto run_all_ops = [&](std::unordered_set &set) { - for (auto *op : set) { - for (auto out : op->Outputs()) { - ready_vars.emplace(out); + for (OpHandleBase *op : ir::FilterByNodeWrapper(*graph)) { + if (op->Inputs().empty()) { + ready_ops.insert(op); + } else { + pending_ops.insert({op, op->NoDupInputSize()}); } } - set.clear(); - }; - while (!pending_vars.empty()) { - run_all_ops(ready_ops); + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; - if (ready_vars.empty()) { - return false; - } + while (!pending_vars.empty()) { + run_all_ops(ready_ops); - for (auto ready_var : ready_vars) { - pending_vars.erase(ready_var); - for (auto *op : ready_var->PendingOps()) { - auto &deps = --pending_ops[op]; - if (deps == 0) { - ready_ops.insert(op); + if (ready_vars.empty()) { + return false; + } + + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->PendingOps()) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } } } + ready_vars.clear(); } - ready_vars.clear(); + return true; } - return true; -} +}; + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 7e320a0894..d91993bd4f 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -42,6 +42,12 @@ namespace { typedef 
std::vector GraphOps; const char kGraphOps[] = "ops"; +bool OpHaveRole(const ir::Node &node, const framework::OpRole &role) { + return boost::get( + node.Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(role); +} + void PolishGraphToSupportDataHazards(ir::Graph *graph) { for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { @@ -128,15 +134,8 @@ void AddOutputToLeafOps(ir::Graph *graph) { } } // namespace -static const char kLossVarName[] = "loss_var_name"; -static const char kPlaces[] = "places"; -static const char kLocalScopes[] = "local_scopes"; -static const char kStrategy[] = "strategy"; -static const char kNumTrainers[] = "num_trainers"; - -void MultiDevSSAGraphBuilder::Init() const { +void MultiDevSSAGraphBuilderBase::Init() const { all_vars_.clear(); - balance_vars_.clear(); loss_var_name_ = Get(kLossVarName); places_ = Get>(kPlaces); @@ -145,320 +144,82 @@ void MultiDevSSAGraphBuilder::Init() const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_ = &Get("nccl_ctxs"); #endif - - balance_vars_.resize(places_.size(), 0); - if (strategy_.enable_data_balance_ && places_.size() == 1) { - LOG(WARNING) << "It is no need to enable data balance when there is only " - "one place. enable_data_balance is set to False."; - strategy_.enable_data_balance_ = false; - } -} - -void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, - ir::Node *node, - size_t place_id) const { - auto p = places_[place_id]; - auto *op_handle = result->Get(kGraphOps).back(); - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); - - for (ir::Node *input : node->inputs) { - VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); - op_handle->AddInput(var); - } - - for (ir::Node *output : node->outputs) { - ir::Node *new_node = nullptr; - if (output->Var()) { - new_node = result->CreateVarNode(output->Var()); - } else { - new_node = - result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); - } - CreateOpOutput(result, op_handle, new_node, p, place_id); - } -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( - const std::vector &nodes) const { - std::vector send_vars; - // since parameters are all in block 0, - // it's enough to only scan send ops in block 0 - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find send op, - // instead of the the hard code string - if (op->Type() == "send") { - auto op_vars = op->InputArgumentNames(); - send_vars.reserve(send_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return send_vars; -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( - const std::vector &nodes) const { - std::vector recv_vars; - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find recv op, - // instead of the hard code string - if (op->Type() == "recv") { - auto op_vars = op->OutputArgumentNames(); - recv_vars.reserve(recv_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return recv_vars; -} - -size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( - const std::vector &var_names) const { - int64_t numel_sum = 0; - for (auto var_name : var_names) { - if (all_vars_.find(var_name) == all_vars_.end()) continue; - auto var_desc = all_vars_.at(var_name); - 
PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GT(numel, 0); - numel_sum += numel; - } - - auto smallest = - std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_vars_), smallest)); - balance_vars_[dev_id] += numel_sum; - return dev_id; -} - -// Topology sort the graph nodes from inputs to outputs. -// Since SSAGraphBuilder depends on forward/backward nodes to assign devices -// to parameter/gradients before optimizer ops, topo sort is insufficient. ( -// some optimizer ops might not depend on any nodes), we manually move all -// optimizer nodes after last backward nodes. -// However, the assumption by SSAGraphBuilder should be relaxed in the future. -std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { - std::vector ret = ir::TopologySortOperations(graph); - size_t last_backward = 0; - for (size_t i = 0; i < ret.size(); ++i) { - if (boost::get( - ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kBackward)) { - last_backward = i; - } - } - - std::vector optimize_ops; - std::vector sorted_ret; - for (size_t i = 0; i < ret.size(); ++i) { - if (i < last_backward) { - if (static_cast(boost::get(ret[i]->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kOptimize))) { - optimize_ops.push_back(ret[i]); - } else { - sorted_ret.push_back(ret[i]); - } - } else if (i == last_backward) { - sorted_ret.push_back(ret[i]); - // Verify that no operations before optimize ops depends on optimize ops. - std::unordered_set optimize_set(optimize_ops.begin(), - optimize_ops.end()); - for (ir::Node *n : sorted_ret) { - for (ir::Node *in : n->inputs) { - for (ir::Node *pre_n : in->inputs) { - PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), - "optimize operations cannot be depended by forward " - "or backward node %s -> %s", - pre_n->Name(), n->Name()); - } - } - } - sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(), - optimize_ops.end()); - } else { - sorted_ret.push_back(ret[i]); - } - } - return sorted_ret; } -std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( +std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( std::unique_ptr graph) const { Init(); - // Give the topology sort order and rebuild the graph structure. - std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); + std::vector sorted_ops = SortOperations(*graph); + auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - int num_trainers = Get(kNumTrainers); - for (auto &node : nodes) { if (node->IsVar() && node->Var()) { all_vars_.emplace(node->Name(), node->Var()); } } - std::unordered_set og_has_been_broadcast; // We cannot invoke resize. 
It is a bug of GCC 4.8 result.Set(kGraphVars, new GraphVars(places_.size())); result.Set(kGraphDepVars, new GraphDepVars); result.Set(kGraphOps, new GraphOps); - // find send/recv vars so that we can place the distributed training - // related op in the place 0 - auto send_vars = FindDistTrainSendVars(sorted_ops); - auto recv_vars = FindDistTrainRecvVars(sorted_ops); - - std::vector> bcast_var_name_set; - bcast_var_name_set.resize(places_.size()); - - size_t cur_device_id = 0; bool is_forwarding = true; - bool is_dist_train = false; - - std::unordered_map sharded_var_device; + bool insert_collection_ops = NeedCollectiveOps(); for (ir::Node *node : sorted_ops) { - if (boost::get( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kRPC)) { - int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device); - PADDLE_ENFORCE(op_dev_id != -1, - "Can not schedule the RPC operator to the right place."); - if (node->Op()->Type() == "recv") { - auto recv_vars_attr = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] - if (recv_vars_attr[0].find(".block") == std::string::npos) { - bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]); - } - } - is_dist_train = true; - } else if (boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kDist)) { - int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device); - if (node->Op()->Type() == "concat") { - auto origin_param_name = node->Op()->OutputArgumentNames()[0]; - bcast_var_name_set[op_dev_id].emplace(origin_param_name); - } - } else if (IsScaleLossOp(node)) { - // user can customize loss@grad if not use_default_grad_scale_ - if (strategy_.gradient_scale_ != - BuildStrategy::GradientScaleStrategy::kCustomized) { - // TODO(paddle-dev): Why is there no input for this op_handle? - auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; - auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType(); - CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0], - out_dtype); - } - // This assumes the backward generating code will ensure IsScaleLossOp - // is true only for the op that scale the final scalar loss. - // It also assumes backward op will always follow the forward op in - // the block. - is_forwarding = false; + if (DealWithSpecialOp(&result, node)) { + continue; } else { - int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); - if (op_dev_id != -1) { // This op only runs on one specific device. - CreateComputationalOp(&result, node, op_dev_id); - for (ir::Node *n : node->outputs) { - sharded_var_device.emplace(n->Name(), op_dev_id); - } + // This op runs on all devices + if (IsScaleLossOp(node)) { + // user can customize loss@grad if not use_default_grad_scale_ + InsertScaleLossGradOp(&result, node); + // This assumes the backward generating code will ensure IsScaleLossOp + // is true only for the op that scale the final scalar loss. + // It also assumes backward op will always follow the forward op in + // the block. + is_forwarding = false; } else { - // This op runs on all devices, and its output may have parameter's - // gradients. - // TODO(paddle-dev): Why is so special about "read" op? 
- if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) { - node->Op()->SetAttr("throw_eof_exp", false); - CreateComputationalOps(&result, node, places_.size()); - const auto &data_var_names = node->Op()->Output("Out"); - InsertDataBalanceOp(&result, data_var_names); - } else { - CreateComputationalOps(&result, node, places_.size()); - } + CreateComputationalOps(&result, node, places_.size()); + } + + // Insert collection ops + if (!is_forwarding && insert_collection_ops) { + try { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; - if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. - if (static_cast(boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward))) { - try { - auto backward_vars = boost::get>( - node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = GetAppropriateDeviceID({g_name}); - CreateReduceOp(&result, g_name, cur_device_id); - sharded_var_device.emplace(g_name, cur_device_id); - if (!is_dist_train) { - bcast_var_name_set[cur_device_id].emplace(p_name); - } - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; - default: - LOG(FATAL) << "Unknown reduce strategy "; - break; - } - } - } catch (boost::bad_get e) { - } + auto backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + + InsertCollectiveOp(&result, p_name, g_name); } + } catch (boost::bad_get e) { } } } } - bool use_gpu = false; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - use_gpu = nccl_ctxs_ != nullptr; -#endif - // Insert broadcast operators principle: - // 1. Broadcast optimized parameters in Reduce strategy; - // 2. No need broadcast optimized parameters in AllReduce strategy because of - // the optimization sub-graph would be run on every GPU; - // 3. Allways broadcast received parameters in Distribute Training. - if ((use_gpu && - strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || - is_dist_train) { - if (strategy_.fuse_broadcast_op_) { - CreateFusedBroadcastOp(&result, bcast_var_name_set); - } else { - for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { - auto &to_bcast_set = bcast_var_name_set[dev_id]; - for (auto &bcast_name : to_bcast_set) { - CreateBroadcastOp(&result, bcast_name, dev_id); - } - } - } - } + InsertPostprocessOps(&result); + /* Dependency graph has been constructed. However, there are still data hazards need to be handled. 
- */ + */ PolishGraphToSupportDataHazards(&result); /* @@ -469,15 +230,77 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( return graph; } -bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { - PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; +void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( + ir::Graph *result, const ir::Node *node) const { + // user can customize loss@grad if not use_default_grad_scale_ + size_t loss_scale = 0; + switch (this->strategy_.gradient_scale_) { + case BuildStrategy::GradientScaleStrategy::kOne: + loss_scale = 1; + break; + case BuildStrategy::GradientScaleStrategy::kCoeffNumDevice: + loss_scale = Get(kNRanks); + break; + case BuildStrategy::GradientScaleStrategy::kCustomized: + loss_scale = 0; + break; + default: + LOG(FATAL) << "Unknown gradient scale strategy."; + break; + } + + if (loss_scale) { + // TODO(paddle-dev): Why is there no input for this op_handle? + auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; + auto out_dtype = this->all_vars_.at(loss_grad_name)->GetDataType(); + this->CreateScaleLossGradOp(result, loss_grad_name, node->outputs[0], + loss_scale, out_dtype); + } +} + +std::vector MultiDevSSAGraphBuilderBase::SortOperations( + const ir::Graph &graph) const { + return ir::TopologySortOperations(graph); +} + +bool MultiDevSSAGraphBuilderBase::UseGPU() const { + bool use_gpu = false; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + use_gpu = nccl_ctxs_ != nullptr; +#endif + return use_gpu; +} + +bool MultiDevSSAGraphBuilderBase::NeedCollectiveOps() const { + return Get(kNRanks) > 1; +} + +void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, + ir::Node *node, + size_t place_id) const { + auto p = places_[place_id]; + auto *op_handle = result->Get(kGraphOps).back(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + + for (ir::Node *input : node->inputs) { + VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); + op_handle->AddInput(var); + } + + for (ir::Node *output : node->outputs) { + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, place_id); } - return false; } -void MultiDevSSAGraphBuilder::SetCommunicationContext( +void MultiDevSSAGraphBuilderBase::SetCommunicationContext( OpHandleBase *op_handle, const platform::Place &p) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_ == nullptr) { @@ -490,9 +313,9 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( #endif } -void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, - const std::string &p_name, - size_t src_dev_id) const { +void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, + const std::string &p_name, + size_t src_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), @@ -520,7 +343,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, } } -void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( +void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) @@ -558,17 +381,17 @@ void 
MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( } } -void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, - ir::Node *node, - int dev_id) const { +void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, + ir::Node *node, + int dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), local_scopes_[dev_id], places_[dev_id], dev_id)); CreateOpHandleIOs(result, node, dev_id); } -void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, - const std::string &og) const { +void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( + ir::Graph *result, const std::string &og) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -596,77 +419,15 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, } } -void MultiDevSSAGraphBuilder::InsertDataBalanceOp( - ir::Graph *result, const std::vector &datas) const { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( - result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), - local_scopes_, places_, nccl_ctxs_)); -#else - result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( - result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), - local_scopes_, places_)); -#endif - auto *op_handle = result->Get(kGraphOps).back(); - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - SetCommunicationContext(op_handle, p); - for (const std::string &d_name : datas) { - auto &vars = result->Get(kGraphVars)[i][d_name]; - PADDLE_ENFORCE(!vars.empty()); - op_handle->AddInput(vars.back()); - auto var = new VarHandle( - result->CreateEmptyNode(d_name, ir::Node::Type::kVariable), - vars.size(), i, d_name, p); - vars.emplace_back(var); - op_handle->AddOutput(var); - } - } -} - -int MultiDevSSAGraphBuilder::GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, - const std::unordered_map &sharded_var_device) const { - if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { - return -1; - } - int op_role = boost::get( - node->Op()->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); - if (op_role != static_cast(framework::OpRole::kOptimize)) { - return -1; - } - auto param_grad = boost::get>( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(graph, param_grad[1], sharded_var_device); - PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", - node->Op()->Type(), param_grad[0], param_grad[1]); - return dev_id; -} - -int MultiDevSSAGraphBuilder::GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, - const std::unordered_map &sharded_var_device) const { - auto got = sharded_var_device.find(varname); - if (got == sharded_var_device.end()) { - auto pos = varname.find(framework::kNewGradSuffix); - if (pos != std::string::npos) { - got = sharded_var_device.find(varname.substr(0, pos)); - } - } - return got == sharded_var_device.end() ? 
-1 : got->second; -} - -void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( +void MultiDevSSAGraphBuilderBase::CreateScaleLossGradOp( ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node, proto::VarType::Type dtype) const { + ir::Node *out_var_node, size_t loss_scale, + proto::VarType::Type dtype) const { for (size_t i = 0; i < places_.size(); ++i) { - // Insert ScaleCost OpHandle auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *op_handle = new ScaleLossGradOpHandle( result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), - local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype); + loss_scale, local_scopes_[i], places_[i], dev_ctx, dtype); result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale @@ -680,9 +441,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( } } -void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, - ir::Node *node, - size_t num_places) const { +void MultiDevSSAGraphBuilderBase::CreateComputationalOps( + ir::Graph *result, ir::Node *node, size_t num_places) const { for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; @@ -692,9 +452,9 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, } } -VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, - const std::string &og, - int dst_dev_id) const { +VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result, + const std::string &og, + int dst_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), @@ -723,53 +483,273 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, return var; } -int MultiDevSSAGraphBuilder::CreateDistTrainOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const { - int op_dev_id = -1; - std::vector input_var_names; - std::vector output_var_names; - for (ir::Node *input : node->inputs) { - input_var_names.push_back(input->Name()); +bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const { + return boost::get( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss)) && + !loss_var_name_.empty(); // If loss_var is empty. This is test mode +} + +bool MultiDevSSAGraphBuilderBase::IsSparseGradient( + const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { + return true; } - for (ir::Node *output : node->outputs) { - output_var_names.push_back(output->Name()); + return false; +} + +void AllReduceSSAGraphBuilder::InsertCollectiveOp( + ir::Graph *result, const std::string &p_name, + const std::string &g_name) const { + if (IsSparseGradient(g_name)) { + CreateReduceOp(result, g_name, 0); + CreateBroadcastOp(result, g_name, 0); + } else { + CreateAllReduceOp(result, g_name); } +} - if (node->Op()->Type() == "split_byref" || - node->Op()->Type() == "split_selected_rows" || - node->Op()->Type() == "split_ids") { - // TODO(paddle-dev): getting the first var is not safe. 
- op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); - if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - op_dev_id = GetAppropriateDeviceID(input_var_names); - for (auto &varname : input_var_names) { - sharded_var_device->emplace(varname, op_dev_id); +int BalanceVarSSAGraphBuilder::GetVarDeviceID( + const std::string &varname) const { + auto got = sharded_var_device_.find(varname); + if (got == sharded_var_device_.end()) { + auto pos = varname.find(framework::kNewGradSuffix); + if (pos != std::string::npos) { + got = sharded_var_device_.find(varname.substr(0, pos)); + } + } + return got == sharded_var_device_.end() ? -1 : got->second; +} + +int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { + if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { + return -1; + } + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", + node->Op()->Type(), param_grad[0], param_grad[1]); + return dev_id; +} + +size_t BalanceVarSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; +} + +void BalanceVarSSAGraphBuilder::ResetState() const { + balance_vars_.clear(); + sharded_var_device_.clear(); + + balance_vars_.resize(places_.size(), 0); +} + +void ReduceSSAGraphBuilder::Init() const { + MultiDevSSAGraphBuilderBase::Init(); + ResetState(); +} + +void ReduceSSAGraphBuilder::ResetState() const { + BalanceVarSSAGraphBuilder::ResetState(); + bcast_var_name_set_.clear(); + bcast_var_name_set_.resize(places_.size()); +} + +void ReduceSSAGraphBuilder::InsertCollectiveOp( + ir::Graph *result, const std::string &p_name, + const std::string &g_name) const { + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(result, g_name, cur_device_id); + sharded_var_device_.emplace(g_name, cur_device_id); + bcast_var_name_set_[cur_device_id].emplace(p_name); +} + +bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = BalanceVarSSAGraphBuilder::GetOpDeviceID(node); + if (op_dev_id != -1) { + // This op only runs on one specific device. 
+ CreateComputationalOp(result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + sharded_var_device_.emplace(n->Name(), op_dev_id); + } + return true; + } + return false; +} + +void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { + if (UseGPU()) { + if (strategy_.fuse_broadcast_op_) { + CreateFusedBroadcastOp(result, bcast_var_name_set_); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set_[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(result, bcast_name, dev_id); + } } } - for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + } +} + +int ReduceSSAGraphBuilder::GetOpDeviceID( + ir::Node *node, + std::unordered_map> *delay_ops) const { + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + + if (dev_id == -1) { + (*delay_ops)[param_grad[1]].push_back(node); + return -2; + } + return dev_id; +} + +std::vector ReduceSSAGraphBuilder::SortOperations( + const ir::Graph &graph) const { + std::vector sorted_ops = ir::TopologySortOperations(graph); + return SortForReduceMode(sorted_ops); +} + +std::vector ReduceSSAGraphBuilder::SortForReduceMode( + const std::vector &topo_ops) const { + std::vector sorted_ops; + std::unordered_map> delayed_op; + sorted_ops.reserve(topo_ops.size()); + ResetState(); + + auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { + sharded_var_device_.emplace(var_name, dev_id); + if (delayed_op.count(var_name)) { + auto &ops = delayed_op.at(var_name); + sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); + delayed_op.at(var_name).clear(); } - } else if (node->Op()->Type() == "concat") { - op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); - for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + }; + + for (ir::Node *node : topo_ops) { + int op_dev_id = GetOpDeviceID(node, &delayed_op); + if (op_dev_id > -1) { + // This op only runs on one specific device. + sorted_ops.emplace_back(node); + for (ir::Node *n : node->outputs) { + insert_delayed_op(n->Name(), op_dev_id); + } + } else if (op_dev_id == -1) { + // This op runs on all devices, and its output may have parameter's + // gradients. + sorted_ops.emplace_back(node); + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + std::vector backward_vars; + try { + backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + } catch (boost::bad_get e) { + } + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &g_name = backward_vars[i + 1]; + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + insert_delayed_op(g_name, static_cast(cur_device_id)); + } + } else if (op_dev_id == -2) { + // The Op on which the Op depends has not yet been generated. 
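SortForReduceMode above handles the op_dev_id == -2 case by parking an op until the device of the gradient it depends on has been decided, then flushing it immediately after that decision. A small, framework-free sketch of that delay-and-flush idea follows; the Op struct and the rule that an op "places" the variable named after it are purely illustrative assumptions, not the real graph types.

    #include <string>
    #include <unordered_map>
    #include <vector>

    struct Op {
      std::string name;      // also the name of the variable this op places
      std::string waits_on;  // empty, or a variable whose device must be known
    };

    std::vector<Op> SortWithDelays(const std::vector<Op> &topo,
                                   std::unordered_map<std::string, int> *var_dev) {
      std::vector<Op> sorted;
      std::unordered_map<std::string, std::vector<Op>> delayed;
      for (const Op &op : topo) {
        if (op.waits_on.empty() || var_dev->count(op.waits_on)) {
          sorted.push_back(op);        // device already known: emit in order
          (*var_dev)[op.name] = 0;     // this op decides where its output lives
          auto it = delayed.find(op.name);
          if (it != delayed.end()) {   // flush ops that waited on this variable
            sorted.insert(sorted.end(), it->second.begin(), it->second.end());
            delayed.erase(it);
          }
        } else {
          delayed[op.waits_on].push_back(op);  // park until the var is placed
        }
      }
      return sorted;
    }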
} - } else { - LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); - PADDLE_THROW( - "the distribute training related op should be in [split_byref, " - "concat]."); } - PADDLE_ENFORCE(op_dev_id != -1, - "can not find right place for distributed op: %s", - node->Op()->Type()); + PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); - CreateComputationalOp(result, node, op_dev_id); - return op_dev_id; + ResetState(); + return sorted_ops; +} + +void DistSSAGraphBuilder::Init() const { + MultiDevSSAGraphBuilderBase::Init(); + ResetState(); +} + +void DistSSAGraphBuilder::ResetState() const { + BalanceVarSSAGraphBuilder::ResetState(); + bcast_var_name_set_.clear(); + bcast_var_name_set_.resize(places_.size()); +} + +bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + bool insert_op = false; + if (OpHaveRole(*node, OpRole::kRPC)) { + int op_dev_id = CreateRPCOp(result, node); + PADDLE_ENFORCE(op_dev_id != -1, + "Can not schedule the RPC operator to the right place."); + if (node->Op()->Type() == "recv") { + auto recv_vars_attr = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] + if (recv_vars_attr[0].find(".block") == std::string::npos) { + bcast_var_name_set_[op_dev_id].emplace(recv_vars_attr[0]); + } + } + insert_op = true; + need_broadcast_var_ = true; + } else if (OpHaveRole(*node, OpRole::kDist)) { + int op_dev_id = CreateDistTrainOp(result, node); + if (node->Op()->Type() == "concat") { + auto origin_param_name = node->Op()->OutputArgumentNames()[0]; + bcast_var_name_set_[op_dev_id].emplace(origin_param_name); + } + insert_op = true; + } else { + int op_dev_id = GetOpDeviceID(node); + if (op_dev_id != -1) { // This op only runs on one specific device. + CreateComputationalOp(result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + sharded_var_device_.emplace(n->Name(), op_dev_id); + } + insert_op = true; + } + } + return insert_op; } void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { @@ -788,14 +768,11 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { } // Create RPC related op handles that connects its in ops and out ops. -int MultiDevSSAGraphBuilder::CreateRPCOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const { +int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. 
- op_dev_id = - GetVarDeviceID(*result, node->inputs[0]->Name(), *sharded_var_device); + op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -813,9 +790,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( VLOG(10) << "send grad " << input_var_names[0] << " origin " << send_param_grad[1] << " place: " << op_dev_id; for (auto &varname : input_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + sharded_var_device_.emplace(varname, op_dev_id); } - sharded_var_device->emplace(send_param_grad[1], op_dev_id); + sharded_var_device_.emplace(send_param_grad[1], op_dev_id); } } else if (node->Op()->Type() == "recv") { std::vector output_var_names; @@ -825,8 +802,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); if (recv_param_grad.size() == 2U) { - op_dev_id = - GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device); + op_dev_id = GetVarDeviceID(recv_param_grad[1]); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] << " place: " << op_dev_id; @@ -834,7 +810,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( op_dev_id = GetAppropriateDeviceID(output_var_names); } for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + sharded_var_device_.emplace(varname, op_dev_id); } } else { // send_barrier, fetch_barrier will run on place 0; @@ -861,8 +837,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( for (ir::Node *output : node->outputs) { int outvar_dev_id = op_dev_id; if (node->Op()->Type() == "fetch_barrier") { - outvar_dev_id = - GetVarDeviceID(*result, output->Name(), *sharded_var_device); + outvar_dev_id = GetVarDeviceID(output->Name()); PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); } p = places_[outvar_dev_id]; @@ -879,21 +854,124 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( return op_dev_id; } -bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { - return boost::get( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss)) && - !loss_var_name_.empty(); // If loss_var is empty. This is test mode +int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = -1; + std::vector input_var_names; + std::vector output_var_names; + for (ir::Node *input : node->inputs) { + input_var_names.push_back(input->Name()); + } + for (ir::Node *output : node->outputs) { + output_var_names.push_back(output->Name()); + } + + if (node->Op()->Type() == "split_byref" || + node->Op()->Type() == "split_selected_rows" || + node->Op()->Type() == "split_ids") { + // TODO(paddle-dev): getting the first var is not safe. 
+ op_dev_id = GetVarDeviceID(input_var_names[0]); + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + op_dev_id = GetAppropriateDeviceID(input_var_names); + for (auto &varname : input_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else if (node->Op()->Type() == "concat") { + op_dev_id = GetVarDeviceID(input_var_names[0]); + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else { + LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); + PADDLE_THROW( + "the distribute training related op should be in [split_byref, " + "concat]."); + } + + PADDLE_ENFORCE(op_dev_id != -1, + "can not find right place for distributed op: %s", + node->Op()->Type()); + + CreateComputationalOp(result, node, op_dev_id); + return op_dev_id; +} + +void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, + const std::string &p_name, + const std::string &g_name) const { + size_t cur_device_id = 0; + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(result, g_name, cur_device_id); + sharded_var_device_.emplace(g_name, cur_device_id); + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(result, g_name, 0); + CreateBroadcastOp(result, g_name, 0); + } else { + CreateAllReduceOp(result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy."; + break; + } } + +void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { + if (need_broadcast_var_ || + (UseGPU() && + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) { + if (strategy_.fuse_broadcast_op_) { + CreateFusedBroadcastOp(result, bcast_var_name_set_); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set_[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(result, bcast_name, dev_id); + } + } + } + } +} + +std::unordered_set &MultiDevSSAGraphBuilder() { + static std::unordered_set regs; + return regs; +} + +static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) { + MultiDevSSAGraphBuilder().insert(builder_mode); + return 0; +} + } // namespace details } // namespace framework } // namespace paddle -REGISTER_PASS(multi_devices_pass, - paddle::framework::details::MultiDevSSAGraphBuilder) - .RequirePassAttr(paddle::framework::details::kLossVarName) - .RequirePassAttr(paddle::framework::details::kPlaces) - .RequirePassAttr(paddle::framework::details::kLocalScopes) - .RequirePassAttr(paddle::framework::details::kStrategy) - .RequirePassAttr(paddle::framework::details::kNumTrainers); +#define REGISTER_MULTI_DEVICES_PASS(pass_name, pass_class) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + _reg_ssa_graph_builder_##pass_name, \ + "REGISTER_MULTI_DEVICES_PASS must be called in global namespace."); \ + int _reg_ssa_graph_builder_entry_##pass_name = \ + paddle::framework::details::MultiDevSSAGraphBuilderRegister(#pass_name); \ + REGISTER_PASS(pass_name, pass_class) \ + .RequirePassAttr(paddle::framework::details::kLossVarName) \ + .RequirePassAttr(paddle::framework::details::kPlaces) \ + .RequirePassAttr(paddle::framework::details::kLocalScopes) \ + .RequirePassAttr(paddle::framework::details::kStrategy) \ + .RequirePassAttr(paddle::framework::details::kNRanks) + 
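The REGISTER_MULTI_DEVICES_PASS macro above does two things: it registers the pass with the usual ir::Pass registry and records its name in the MultiDevSSAGraphBuilder() set, which is what BuildStrategy::IsMultiDevPass consults when deciding which passes need the places/scopes/nranks attributes. Structurally, the old monolithic builder is split into a base pass with virtual hooks (DealWithSpecialOp, InsertCollectiveOp, InsertPostprocessOps) plus three registered subclasses. A condensed, illustrative sketch of that shape, with abbreviated names rather than the real class hierarchy:

    #include <iostream>
    #include <string>
    #include <unordered_set>

    // Function-local static registry of builder names, as in this patch.
    std::unordered_set<std::string> &BuilderRegistry() {
      static std::unordered_set<std::string> names;
      return names;
    }
    int RegisterBuilder(const std::string &name) {
      BuilderRegistry().insert(name);
      return 0;
    }

    // Base pass: the traversal is fixed, the per-strategy behavior is virtual.
    class BuilderBase {
     public:
      virtual ~BuilderBase() = default;
      void Apply() {
        // ... walk the ops; subclasses decide what each gradient becomes ...
        InsertCollectiveOp("w", "w@GRAD");
        InsertPostprocessOps();
      }
     protected:
      virtual void InsertCollectiveOp(const std::string &p_name,
                                      const std::string &g_name) = 0;
      virtual void InsertPostprocessOps() {}
    };

    class AllReduceBuilder : public BuilderBase {
      void InsertCollectiveOp(const std::string &, const std::string &g) override {
        std::cout << "allreduce " << g << "\n";
      }
    };

    class ReduceBuilder : public BuilderBase {
      void InsertCollectiveOp(const std::string &, const std::string &g) override {
        std::cout << "reduce " << g << " to one device\n";
      }
      void InsertPostprocessOps() override { std::cout << "broadcast params\n"; }
    };

    static int reg_allreduce = RegisterBuilder("allreduce_mode_multi_devices_pass");
    static int reg_reduce = RegisterBuilder("reduce_mode_multi_devices_pass");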
+REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass, + paddle::framework::details::ReduceSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS( + allreduce_mode_multi_devices_pass, + paddle::framework::details::AllReduceSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, + paddle::framework::details::DistSSAGraphBuilder); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 5736102ddc..6d4386538e 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include #include @@ -30,84 +31,154 @@ namespace framework { class Scope; namespace details { -class MultiDevSSAGraphBuilder : public ir::Pass { +constexpr char kLossVarName[] = "loss_var_name"; +constexpr char kPlaces[] = "places"; +constexpr char kLocalScopes[] = "local_scopes"; +constexpr char kStrategy[] = "strategy"; +constexpr char kNRanks[] = "nranks"; + +class MultiDevSSAGraphBuilderBase : public ir::Pass { protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; - private: - void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, - size_t device_id) const; - void Init() const; + virtual void Init() const; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - mutable platform::NCCLContextMap *nccl_ctxs_; -#endif + virtual std::vector SortOperations(const ir::Graph &graph) const; - int GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, - const std::unordered_map &sharded_var_device) const; + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const = 0; - bool IsScaleLossOp(ir::Node *node) const; + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0; - int CreateRPCOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const; - int CreateDistTrainOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const; + virtual void InsertPostprocessOps(ir::Graph *result) const = 0; - std::vector FindDistTrainSendVars( - const std::vector &nodes) const; + bool UseGPU() const; - std::vector FindDistTrainRecvVars( - const std::vector &nodes) const; + bool NeedCollectiveOps() const; + + bool IsScaleLossOp(ir::Node *node) const; void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; void CreateScaleLossGradOp(ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node, + ir::Node *out_var_node, size_t loss_scale, proto::VarType::Type dtype) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const; + void CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const; - int GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, - const std::unordered_map &sharded_var_device) const; - - void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; + bool IsSparseGradient(const std::string &og) const; - void InsertDataBalanceOp(ir::Graph *result, - const std::vector &datas) const; + void CreateAllReduceOp(ir::Graph *result, const std::string &og) const; void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const; + void InsertScaleLossGradOp(ir::Graph *result, const ir::Node *node) const; + void CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) 
const; - bool IsSparseGradient(const std::string &og) const; - - size_t GetAppropriateDeviceID( - const std::vector &var_names) const; - void SetCommunicationContext(OpHandleBase *op_handle, const platform::Place &p) const; + void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, + size_t device_id) const; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + mutable platform::NCCLContextMap *nccl_ctxs_; +#endif + mutable std::string loss_var_name_; mutable std::vector places_; mutable std::vector local_scopes_; mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; +}; + +class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { + return false; + } + + virtual void InsertPostprocessOps(ir::Graph *result) const {} +}; + +class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + int GetVarDeviceID(const std::string &varname) const; + + int GetOpDeviceID(ir::Node *node) const; + + size_t GetAppropriateDeviceID( + const std::vector &var_names) const; + + virtual void ResetState() const; + + mutable std::unordered_map sharded_var_device_; mutable std::vector balance_vars_; }; + +class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder { + protected: + virtual void Init() const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const; + + virtual std::vector SortOperations(const ir::Graph &graph) const; + + virtual void ResetState() const; + + int GetOpDeviceID(ir::Node *node, + std::unordered_map> + *delay_ops) const; + + std::vector SortForReduceMode( + const std::vector &topo_ops) const; + + mutable std::vector> bcast_var_name_set_; +}; + +class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder { + protected: + virtual void Init() const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual void ResetState() const; + + int CreateRPCOp(ir::Graph *result, ir::Node *node) const; + + int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; + + mutable std::vector> bcast_var_name_set_; + mutable bool need_broadcast_var_{false}; +}; + +std::unordered_set &MultiDevSSAGraphBuilder(); + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc new file mode 100644 index 0000000000..128aaa33a2 --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &places, + std::vector> &&graphs) + : strategy_(std::move(strategy)), + local_scopes_(std::move(local_scopes)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), + places_(std::move(places)), + graphs_(std::move(graphs)) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + + // set the correct size of thread pool to each device. + strategy_.num_threads_ = strategy_.num_threads_ < places_.size() + ? 1UL + : strategy_.num_threads_ / places_.size(); + VLOG(1) << "set num_threads: " << strategy_.num_threads_ + << " to run the operators of the graph on each device."; + for (size_t i = 0; i < places.size(); ++i) { + executors_.emplace_back(new details::ThreadedSSAGraphExecutor( + strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); + } +} + +FeedFetchList ParallelSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + std::vector> run_futures; + + std::vector fetch_data; + FeedFetchList ret; + + fetch_data.reserve(places_.size()); + ret.reserve(fetch_tensors.size()); + exception_holder_.Clear(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto call = [this, i, &fetch_tensors]() -> FeedFetchList { + try { + return executors_[i]->Run(fetch_tensors); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + } + return FeedFetchList(); + }; + + if (pool_) { + run_futures.emplace_back(pool_->enqueue(std::move(call))); + } else { + fetch_data.emplace_back(std::move(call())); + } + } + + if (pool_) { + for (auto &f : run_futures) { + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + fetch_data.emplace_back(std::move(f.get())); + } + } + } + if (exception_holder_.IsCaught()) { + exception_holder_.ReThrow(); + } + + for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { + std::vector lodtensor_ptrs; + lodtensor_ptrs.reserve(local_scopes_.size()); + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx)); + } + ret.emplace_back(); + ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); + } + return ret; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h new file mode 100644 index 0000000000..c00c5bc2d1 --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "ThreadPool.h" +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +class ParallelSSAGraphExecutor : public SSAGraphExecutor { + public: + ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::vector> &&graphs); + ~ParallelSSAGraphExecutor() final = default; + const ir::Graph &Graph() const override { return *graphs_[0]; } + + FeedFetchList Run(const std::vector &fetch_tensors) override; + + private: + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; + std::vector places_; + std::vector> graphs_; + + std::vector> executors_; + ExceptionHolder exception_holder_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 57f6fc66c5..91e4f9adb4 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -56,7 +56,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } } std::vector fetch_data; - std::exception_ptr eptr; + std::exception_ptr eptr = nullptr; try { fetch_data = underlying_executor_->Run(fetch_tensors); } catch (...) 
{ @@ -64,20 +64,26 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); - drop_scope_counter_ += 1; + ++drop_scope_counter_; - if (!fetch_tensors.empty() || - drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { - drop_scope_counter_ = 0; - // Wait All computational streams - for (auto p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); + bool stream_end = false; + if (!fetch_tensors.empty()) { + WaitComputationalStreams(); + stream_end = true; + } + + if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { + if (!stream_end) { + WaitComputationalStreams(); } + for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } + + drop_scope_counter_ = 0; } if (eptr) { std::rethrow_exception(eptr); diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 5e87e0bf50..0f6340213d 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector& fetch_tensors) override; + private: + inline void WaitComputationalStreams() { + // Wait All computational streams + for (auto p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + } + private: size_t drop_scope_counter_{0}; diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 3dfd14419d..134f759081 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) { } else if (var->IsType()) { (*func)(var->GetMutable()); } else { - PADDLE_THROW("Not supported type %s", var->Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var->Type())); } } @@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) { } else if (var.IsType()) { (*func)(var.Get()); } else { - PADDLE_THROW("Not supported type %s", var.Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h index 73f92fa389..88aee8379d 100644 --- a/paddle/fluid/framework/dim.h +++ b/paddle/fluid/framework/dim.h @@ -16,332 +16,184 @@ #include #include #include +#include #include +#include "paddle/fluid/framework/array.h" #include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { namespace framework { // Statically sized, statically indexed dimension -template -struct Dim { - static constexpr int dimensions = i; +template +class Dim : public Array { + public: + static_assert(D >= 0, "D must be not less than 0"); - template - HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) 
{ - static_assert(sizeof...(_tail) == i - 1, - "Dim initialized with the wrong number of parameters"); - } + static constexpr int kRank = D; + using BaseClass = Array; - HOSTDEVICE - Dim(int64_t _head, const Dim& _tail) : head(_head), tail(_tail) {} + inline Dim(int64_t head, const Dim& tail) { + (*this)[0] = head; + new (this->GetMutable() + 1) Dim(tail); + } - HOSTDEVICE - Dim() : head(0), tail() {} + template + HOSTDEVICE explicit Dim(int64_t head, Args... args) + : BaseClass(head, args...) {} /** Construct a Dim from a linear index and size. Uses Fortran order * indexing. */ - HOSTDEVICE - Dim(int64_t idx, const Dim& size) - : head(idx % size.head), tail(idx / size.head, size.tail) {} + HOSTDEVICE Dim(int64_t idx, const Dim& size); /** Construct a Dim with each dimension set to the given index */ - HOSTDEVICE - Dim(int64_t idx) : head(idx), tail(idx) {} + HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); } - HOSTDEVICE - bool operator==(const Dim& o) const { - return (head == o.head) && (tail == o.tail); - } - - HOSTDEVICE - bool operator!=(const Dim& o) const { return !(*this == o); } - - HOSTDEVICE - int64_t& operator[](int idx); - HOSTDEVICE - int64_t operator[](int idx) const; + HOSTDEVICE Dim() = default; HOST std::string to_string() const; - - int64_t head; - Dim tail; }; -// Base case specialization -template <> -struct Dim<0> { - static constexpr int dimensions = 0; - - HOSTDEVICE - Dim(int64_t _head) {} - - HOSTDEVICE - Dim() {} - - HOSTDEVICE - Dim(int idx, const Dim<0>& size) { -#ifndef __CUDA_ARCH__ - if (idx > 0) { - throw std::invalid_argument("Index out of range."); - } -#else - PADDLE_ASSERT(idx == 0); -#endif - } - - HOSTDEVICE - bool operator==(const Dim<0>& o) const { return true; } - - HOSTDEVICE - bool operator!=(const Dim<0>& o) const { return false; } - - HOSTDEVICE - int64_t& operator[](int idx); - HOSTDEVICE - int64_t operator[](int idx) const; -}; - -namespace { - -// Helper for accessing Dim classes -template -struct DimGetter { - // Return a copy if Dim is const - template - HOSTDEVICE static int64_t impl(const D& d) { - return DimGetter::impl(d.tail); - } - // Return a reference if Dim is mutable - template - HOSTDEVICE static int64_t& impl(D& d) { - return DimGetter::impl(d.tail); +namespace detail { +template +struct FortranOrderIndexingConstructorFunctor { + HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx, + int64_t* out) { + out[kStart] = (*idx) % in[kStart]; + (*idx) /= in[kStart]; + FortranOrderIndexingConstructorFunctor::Run(in, idx, + out); } }; -// Eureka! We found the element! 
-template <> -struct DimGetter<0> { - // Return a copy if Dim is const - template - HOSTDEVICE static int64_t impl(const D& d) { - return d.head; - } - // Return a reference if Dim is mutable - template - HOSTDEVICE static int64_t& impl(D& d) { - return d.head; - } +template +struct FortranOrderIndexingConstructorFunctor { + HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx, + int64_t* out) {} }; +} // namespace detail template -HOSTDEVICE int64_t& indexer(Dim& dim, int idx) { -#ifndef __CUDA_ARCH__ - if (idx < 0) { - throw std::invalid_argument("Tried to access a negative dimension"); - } -#else - PADDLE_ASSERT(idx >= 0); -#endif - if (idx == 0) { - return dim.head; - } - return indexer(dim.tail, idx - 1); -} - -template <> -HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) { -#ifndef __CUDA_ARCH__ - throw std::invalid_argument("Invalid index"); -#else - PADDLE_ASSERT(false); -#if CUDA_VERSION < 8000 - // On CUDA versions previous to 8.0, only __shared__ variables - // could be declared as static in the device code. - int64_t head = 0; -#else - static int64_t head = 0; -#endif - return head; -#endif -} - -template -HOSTDEVICE int64_t indexer(const Dim& dim, int idx) { -#ifndef __CUDA_ARCH__ - if (idx < 0) { - throw std::invalid_argument("Tried to access a negative dimension"); - } -#else - PADDLE_ASSERT(idx >= 0); -#endif - if (idx == 0) { - return dim.head; - } - return indexer(dim.tail, idx - 1); -} - -template <> -HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) { -#ifndef __CUDA_ARCH__ - throw std::invalid_argument("Invalid index"); -#else - PADDLE_ASSERT(false); -#if CUDA_VERSION < 8000 - // On CUDA versions previous to 8.0, only __shared__ variables - // could be declared as static in the device code. - int64_t head = 0; -#else - static int64_t head = 0; -#endif - return head; -#endif -} - -} // namespace -// Static access to constant Dim -template -HOSTDEVICE int64_t get(const Dim& d) { - return DimGetter::impl(d); +HOSTDEVICE Dim::Dim(int64_t idx, const Dim& size) { + detail::FortranOrderIndexingConstructorFunctor<0, D, D == 0>::Run( + size.Get(), &idx, this->GetMutable()); } -// Static access to mutable Dim -template -HOSTDEVICE int64_t& get(Dim& d) { - return DimGetter::impl(d); +template +HOSTDEVICE inline int64_t get(const Dim& dim) { + return dim[idx]; } -// Dynamic access to constant Dim -template -HOSTDEVICE int64_t Dim::operator[](int i) const { - return indexer(*this, i); +template +HOSTDEVICE inline int64_t& get(Dim& dim) { // NOLINT + return dim[idx]; } -// Dynamic access to mutable Dim -template -HOSTDEVICE int64_t& Dim::operator[](int i) { - return indexer(*this, i); -} - -// Dynamic access to constant Dim -inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const { - return indexer(*this, i); -} - -// Dynamic access to mutable Dim -inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) { - return indexer(*this, i); -} - -// Dynamic access to constant Dim -// without std::enable_if will try to instantiate this on get<0>(d) -template -HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim& d, - int i) { - return d[i]; +template +HOSTDEVICE inline int64_t get(const Dim& dim, int idx) { + return dim[idx]; } -// Dynamic access to mutable Dim -template -HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim& d, - int i) { - return d[i]; +template +HOSTDEVICE inline int64_t& get(Dim& dim, int idx) { // NOLINT + return dim[idx]; } // Dot product of two dims -template -HOSTDEVICE int64_t linearize(const Dim& a, const 
Dim& b) { - return a.head * b.head + linearize(a.tail, b.tail); -} - -// Base case dot product of two Dims -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) { - return 0; +template +HOSTDEVICE inline int64_t linearize(const Dim& a, const Dim& b) { + return UnrollProduct::Run(a.Get(), b.Get()); } // Product of a Dim -template -HOSTDEVICE int64_t product(const Dim& a, int prod = 1) { - return prod * a.head * product(a.tail); -} - -// Base case product of a Dim -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) { - return prod; +template +HOSTDEVICE inline int64_t product(const Dim& a) { + return UnrollProduct::Run(a.Get()); } // Is 0 <= idx_i < size_i for all i? -template -HOSTDEVICE bool contained(const Dim& idx, const Dim& size) { - return ((0 <= idx.head) && (idx.head < size.head) && - contained(idx.tail, size.tail)); -} +namespace detail { +template +struct ContainedFunctor { + HOSTDEVICE static inline bool Run(const int64_t* idx, const int64_t* size) { + return (idx[kStart] >= 0 && idx[kStart] < size[kStart]) && + ContainedFunctor::Run(idx, + size); + } +}; -// Base case of is 0 <= idx_i < size_i ? -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) { - return true; +template +struct ContainedFunctor { + HOSTDEVICE static constexpr inline bool Run(const int64_t* idx, + const int64_t* size) { + return true; + } +}; +} // namespace detail + +template +HOSTDEVICE inline bool contained(const Dim& idx, const Dim& size) { + return detail::ContainedFunctor<0, D, D == 0>::Run(idx.Get(), size.Get()); } /** * \brief Compute exclusive prefix-multiply of a Dim. */ -template -HOSTDEVICE Dim ex_prefix_mul(const Dim& src, int mul = 1) { - return Dim(mul, ex_prefix_mul(src.tail, mul * src.head)); -} +namespace detail { +template +struct ExPrefixMulFunctor { + HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) { + kStart == 0 ? 
out[kStart] = 1 : out[kStart] = + out[kStart - 1] * in[kStart - 1]; + detail::ExPrefixMulFunctor::Run(in, + out); + } +}; + +template +struct ExPrefixMulFunctor { + HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {} +}; +} // namespace detail -///\cond HIDDEN -// Base case of ex_prefix_mul -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) { - return Dim<0>(); +template +HOSTDEVICE inline Dim ex_prefix_mul(const Dim& src) { + Dim ret; + detail::ExPrefixMulFunctor<0, D, D == 0>::Run(src.Get(), ret.GetMutable()); + return ret; } -///\endcond /** * Add two dimensions together */ -template -HOSTDEVICE Dim dim_plus(const Dim& a, const Dim& b) { - return Dim(a.head + b.head, dim_plus(a.tail, b.tail)); -} - -// Base case -template <> -HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) { - return Dim<0>(); +template +HOSTDEVICE inline Dim dim_plus(const Dim& a, const Dim& b) { + Dim ret; + UnrollAdd::Run(a.Get(), b.Get(), ret.GetMutable()); + return ret; } -template -HOSTDEVICE Dim operator+(const Dim& lhs, const Dim& rhs) { +template +HOSTDEVICE inline Dim operator+(const Dim& lhs, const Dim& rhs) { return dim_plus(lhs, rhs); } /** * Multiply two dimensions together */ -template -HOSTDEVICE Dim dim_mult(const Dim& a, const Dim& b) { - return Dim(a.head * b.head, dim_mult(a.tail, b.tail)); -} - -// Base case -template <> -HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) { - return Dim<0>(); +template +HOSTDEVICE inline Dim dim_mult(const Dim& a, const Dim& b) { + Dim ret; + UnrollMul::Run(a.Get(), b.Get(), ret.GetMutable()); + return ret; } -template -HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { +template +HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { return dim_mult(lhs, rhs); } @@ -354,23 +206,32 @@ HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { * \return Dim object the same size as \p size with normalized strides * */ +namespace detail { +template +struct NormalizeStridesFunctor { + HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride, + int64_t* ret) { + ret[kStart] = (size[kStart] == 1 ? 0 : stride[kStart]); + NormalizeStridesFunctor::Run( + size, stride, ret); + } +}; -template -HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { - int norm_stride = size.head == 1 ? 0 : stride.head; - return Dim(norm_stride, normalize_strides(size.tail, stride.tail)); -} - -///\cond HIDDEN +template +struct NormalizeStridesFunctor { + HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride, + int64_t* ret) {} +}; +} // namespace detail -template <> -HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, - const Dim<0>& stride) { - return Dim<0>(); +template +HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { + Dim ret; + detail::NormalizeStridesFunctor<0, D, D == 0>::Run(size.Get(), stride.Get(), + ret.GetMutable()); + return ret; } -///\endcond - /** * Helper function to create a Dim * @@ -379,25 +240,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, */ template -HOSTDEVICE Dim make_dim(Args... idxes) { +HOSTDEVICE inline Dim make_dim(Args... 
idxes) { return Dim(idxes...); } // Allows us to output a Dim -// XXX For some reason, overloading fails to resolve this correctly -template -typename std::enable_if<(i > 1), std::ostream&>::type operator<<( - std::ostream& os, const Dim& d) { - os << d.head << ", " << d.tail; - return os; -} - -// Base case that allows us to output a Dim -// XXX I wish this could be an overload instead of a template -template -typename std::enable_if<(i == 1), std::ostream&>::type operator<<( - std::ostream& os, const Dim& d) { - os << d.head; +template +inline std::ostream& operator<<(std::ostream& os, const Dim& d) { + os << d[0]; + for (int i = 1; i < D; ++i) { + os << ", " << d[i]; + } return os; } @@ -405,17 +258,15 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { return os; } -template -HOST std::string Dim::to_string() const { +template +HOST std::string Dim::to_string() const { std::stringstream stream; - stream << *this; - return stream.str(); } template -HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { +HOSTDEVICE Dim linear_to_dimension(int linear_index, const Dim& extents) { Dim result; for (int i = 0; i < D - 1; ++i) { @@ -428,5 +279,10 @@ HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { return result; } +template +inline void static_dim_assign(const T1* in, T2* out) { + UnrollAssign::Run(in, out); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index eaef093ed3..39652706c4 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -59,7 +59,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CPUPlace &place) const { - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLCPU; ctx.device_id = 0; return ctx; @@ -67,7 +67,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CUDAPlace &place) const { #ifdef PADDLE_WITH_CUDA - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLGPU; ctx.device_id = place.device; return ctx; @@ -78,7 +78,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { #ifdef PADDLE_WITH_CUDA - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLCPUPinned; ctx.device_id = 0; return ctx; diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h index 0c52bce1ef..e48b0d5c88 100644 --- a/paddle/fluid/framework/dlpack_tensor.h +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -38,7 +38,7 @@ class DLPackTensor { // The shape in DLTensor is defined as int64_t* // Add this member to make TVMTensor init without heap allocation - ShapeType shape_[9]; + ShapeType shape_[DDim::kMaxRank]; }; } // namespace framework diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index da9556c6c1..c93bbe7cee 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -119,7 +119,7 @@ static void DeleteUnusedTensors( } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } } diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 2eb9e564f8..4972bc7ec3 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/pybind/pybind.h" namespace paddle { namespace framework { @@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() { return; #else static unsigned concurrency_cap = std::thread::hardware_concurrency(); + LOG(WARNING) << "concurrency capacity " << concurrency_cap; int thread_id = this->thread_id_; if (static_cast(thread_id) < concurrency_cap) { @@ -238,6 +240,55 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) { VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type(); } +void ExecutorThreadWorker::TrainFilesWithTimer() { + platform::SetNumThreads(1); + SetDevice(); + thread_reader_->Start(); + std::vector op_total_time; + std::vector op_name; + for (auto& op : ops_) { + op_name.push_back(op->Type()); + } + op_total_time.resize(ops_.size()); + for (size_t i = 0; i < op_total_time.size(); ++i) { + op_total_time[i] = 0.0; + } + platform::Timer timeline; + double total_time = 0.0; + double read_time = 0.0; + int cur_batch; + int batch_cnt = 0; + timeline.Start(); + while ((cur_batch = thread_reader_->Next()) > 0) { + timeline.Pause(); + read_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + for (size_t i = 0; i < ops_.size(); ++i) { + timeline.Start(); + ops_[i]->Run(*thread_scope_, place_); + timeline.Pause(); + op_total_time[i] += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + } + ++batch_cnt; + thread_scope_->DropKids(); + if (thread_id_ == 0) { + if (batch_cnt > 0 && batch_cnt % 1000 == 0) { + for (size_t i = 0; i < ops_.size(); ++i) { + fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i, + op_name[i].c_str(), op_total_time[i] / batch_cnt); + } + fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt); + int fetch_var_num = fetch_var_names_.size(); + for (int i = 0; i < fetch_var_num; ++i) { + print_fetch_var(thread_scope_, fetch_var_names_[i]); + } + } + } + timeline.Start(); + } +} + void ExecutorThreadWorker::TrainFiles() { platform::SetNumThreads(1); @@ -320,10 +371,12 @@ void AsyncExecutorThreadWorker::SetPSlibPtr( std::shared_ptr pslib_ptr) { _pslib_ptr = pslib_ptr; } + void AsyncExecutorThreadWorker::SetPullDenseThread( std::shared_ptr dpt) { _pull_dense_thread = dpt; } + void AsyncExecutorThreadWorker::TrainOneNetwork() { PrepareParams(); diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 30b81ad880..524922b032 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ 
b/paddle/fluid/framework/executor_thread_worker.h @@ -155,6 +155,8 @@ class ExecutorThreadWorker { void SetDataFeed(const std::shared_ptr& datafeed); // A multi-thread training function virtual void TrainFiles(); + // with timer log + virtual void TrainFilesWithTimer(); // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); #ifdef PADDLE_WITH_PSLIB diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b7f7e2ee8e..6d795e1e2d 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -45,6 +45,7 @@ pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) +pass_library(conv_affine_channel_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc new file mode 100644 index 0000000000..a7bfb8cf1e --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -0,0 +1,222 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
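// A self-contained sketch (not part of this patch) of the per-operator timing
// loop that TrainFilesWithTimer introduces earlier in this patch: accumulate
// wall-clock seconds per op and print running means every `report_every`
// batches. std::chrono stands in for platform::Timer, and the Op struct is a
// placeholder for a framework operator.
#include <chrono>
#include <cstdio>
#include <string>
#include <vector>

struct Op {                      // stand-in for a framework operator
  std::string type;
  void Run() const {}
};

void TrainWithTimer(const std::vector<Op>& ops, int num_batches,
                    int report_every = 1000) {
  std::vector<double> op_seconds(ops.size(), 0.0);
  for (int batch = 1; batch <= num_batches; ++batch) {
    for (size_t i = 0; i < ops.size(); ++i) {
      auto start = std::chrono::steady_clock::now();
      ops[i].Run();
      std::chrono::duration<double> elapsed =
          std::chrono::steady_clock::now() - start;
      op_seconds[i] += elapsed.count();  // accumulate per-op wall time
    }
    if (batch % report_every == 0) {
      for (size_t i = 0; i < ops.size(); ++i) {
        std::fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
                     ops[i].type.c_str(), op_seconds[i] / batch);
      }
    }
  }
}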
+ +#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_CONV_BN_NODES(pattern_name) \ + /* OPERATORS */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ + /* CONV inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ + /* CONV outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ + /* Affine Channel inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ + /* Affine channel outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ + +void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, + const ir::Node& ac_scale, + const LoDTensor& ac_bias_tensor, + LoDTensor* eltwise_y_in_tensor) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + + // Re-compute bias of conv2d from AffineChannel + PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims()); + + auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); + + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), + ac_bias_tensor.numel(), 1); + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); + + eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; + + // Re-compute weight of conv2d from AffineChannel + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = flatten_to_2d(weights_shape, 1); + + EigenMatrixArrayMap weights_array_2d( + weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= scale_array; +} + +std::unique_ptr ConvAffineChannelFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, false /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvAffineChannel fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + + // check if fuse can be done and if MKL-DNN should be used + FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel); + if (fuse_option == DO_NOT_FUSE) { + VLOG(3) << "do not perform conv+affinechannel fuse"; + return; + } + + // Create eltwise_y (conv bias) variable + VarDesc eltwise_y_in_desc( + patterns::PDNodeName(name_scope_, "eltwise_y_in")); + eltwise_y_in_desc.SetPersistable(true); + auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); + auto* eltwise_y_in_tensor = + 
scope->Var(eltwise_y_in_node->Name())->GetMutable(); + + // Get affine_channel bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + // Initialize eltwise_y + eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); + std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 0.0f); + + // update weights and biases + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // create an elementwise add node. + OpDesc desc; + desc.SetInput("X", std::vector({conv_out->Name()})); + desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); + desc.SetOutput("Out", std::vector({ac_out->Name()})); + desc.SetType("elementwise_add"); + desc.SetAttr("axis", 1); + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. + + GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel}); + + IR_NODE_LINK_TO(conv_out, eltwise_op); + IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); + IR_NODE_LINK_TO(eltwise_op, ac_out); + found_conv_ac_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_conv_ac_count); + return graph; +} + +std::unique_ptr ConvEltwiseAddAffineChannelFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, true /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBN fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + // OPERATORS + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); + // BIAS inputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); + // BIAS outputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); + + // Get eltwise_y (conv bias) variable + auto* eltwise_y_in_tensor = + scope->FindVar(eltwise_y_in->Name())->GetMutable(); + + // Get batch norm bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // Update the elementwise_add node + eltwise->Op()->SetAttr("axis", 1); + eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); + + GraphSafeRemoveNodes(graph.get(), + {ac_scale, ac_bias, affine_channel, eltwise_out}); + + IR_NODE_LINK_TO(eltwise, ac_out); + + found_conv_ac_count++; + }; + + gpd(graph.get(), handler); + AddStatis(found_conv_ac_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_affine_channel_fuse_pass, + paddle::framework::ir::ConvAffineChannelFusePass); +REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, + paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h new file mode 100644 index 0000000000..ad966e11e6 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the Conv and ConvAffineChannel. + */ +class ConvAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvAffineChannelFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_affine_channel_fuse"}; +}; + +class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvEltwiseAddAffineChannelFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index 23f343f631..c6121777e8 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc( const std::string& output) { auto proto = base_desc; framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); desc.SetInput("Bias", {bias}); desc.SetInput("ResidualData", {bias1}); desc.SetAttr("activation", activation); desc.SetOutput("Output", {output}); desc.SetAttr("is_test", true); - + desc.SetAttr("use_cudnn", false); + desc.Flush(); return *desc.Proto(); } std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( std::unique_ptr graph) const { - const std::string pattern_name = "conv_elementwise_add_act_fuse"; + const std::string pattern_name = "conv_elementwise_add2_act_fuse"; FusePassBase::Init(pattern_name, graph.get()); GraphPatternDetector gpd; @@ -76,22 +78,23 @@ std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( framework::OpDesc new_op_desc(new_op_proto, nullptr); // Create a new node for the fused op. - graph->CreateOpNode(&new_op_desc); + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); // Link inputs and outputs. PADDLE_ENFORCE(subgraph.count(x)); auto* conv_in_node = subgraph.at(x); - IR_NODE_LINK_TO(conv_in_node, conv_op); // Input - IR_NODE_LINK_TO(conv_filter, conv_op); // Filter - IR_NODE_LINK_TO(conv_op, conv_out); // Output - IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias - IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output // Delete the unneeded nodes. 
- GraphSafeRemoveNodes(graph.get(), - {conv_op, elementwise_add_op, elementwise_add_op_1, - elementwise_add_out}); + GraphSafeRemoveNodes( + graph.get(), + {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out, elementwise_add_out_1, act_op}); }; gpd(graph.get(), handler); return graph; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8670dcfed7..3eb5bdba3b 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -23,66 +23,8 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { -namespace { - -void CheckProgram(const ProgramDesc &program) { -#define _INT(role) static_cast(role) - - std::map visit; - for (OpDesc *op : program.Block(0).AllOps()) { - // For backward compatibility, some program doesn't have role added. - if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; - int role_id = - boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - visit[role_id] = true; - switch (role_id) { - case _INT(OpRole::kForward): - if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) << "Cannot add backward operator before forward operator " - << op->Type(); - } - break; - case _INT(OpRole::kBackward): - case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add backward operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kForward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | - _INT(OpRole::kLoss)) == visit.end(), - "Cannot add backward|loss operator before " - "forward|loss operator %s.", - op->Type()); - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add forward|loss operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kOptimize): - case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), - "Optimize operators %s must follow backward operator.", - op->Type()); - break; - case _INT(OpRole::kLRSched): - case _INT(OpRole::kDist): - case _INT(OpRole::kRPC): - case _INT(OpRole::kNotSpecified): - break; - default: - LOG(FATAL) << "Unknown operator role. 
Don't add new role because " - "you don't know what you are doing."; - } - } - -#undef _INT -} -} // namespace Graph::Graph(const ProgramDesc &program) : program_(program) { - CheckProgram(program_); auto var_nodes = InitFromProgram(program_); ResolveHazard(var_nodes); } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 13d752e516..c513fe2dd8 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } -std::unordered_set conv_act_set({"identity", "sigmoid", "relu", - "relu6", "relux", "tanh", - "band_pass"}); +std::unordered_set conv_act_set({"identity", "relu"}); PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { conv_in->AsInput(); @@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { ->AsInput(); auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) ->assert_is_op_output("elementwise_add") - ->assert_is_op_input("elementwise_add", "X") + ->assert_is_op_input("elementwise_add", "Y") ->AsIntermediate(); auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) ->assert_is_op("elementwise_add"); auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) - ->assert_is_op_input("elementwise_add", "Y") + ->assert_is_op_input("elementwise_add", "X") ->AsInput(); auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) ->assert_is_op_output("elementwise_add") @@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) .LinksTo({elementwise_add_out}); - elementwise_add_op_1->LinksFrom( - {elementwise_add_out, elementwise_add_in_y_1}); + elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1}) + .LinksTo({elementwise_add_out_1}); act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); return act_out; } @@ -1236,6 +1234,78 @@ PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) { return elementwise_add_out; } +PDNode *patterns::ConvAffineChannel::operator()( + paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + + PDNode *eltwise_op = nullptr; + if (with_eltwise_add) { + eltwise_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + } + + auto *affine_channel_op = + pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel"); + // Create variables + // Conv Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d"); + + PDNode *eltwise_y_in_var = nullptr; + PDNode *eltwise_out_var = nullptr; + if (with_eltwise_add) { + // Conv output as Bias input + conv_out_var->assert_is_op_input("elementwise_add", "X"); + // Bias + eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + 
->AsIntermediate() + ->assert_is_only_output_of_op("elementwise_add"); + } else { + // Conv output as AffineChannel input + conv_out_var->assert_is_op_input("affine_channel", "X"); + } + + // AC Scale + auto *ac_scale_var = pattern->NewNode(ac_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("affine_channel", "Scale"); + // AC Bias + auto *ac_bias_var = pattern->NewNode(ac_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("affine_channel", "Bias"); + + // AC output + auto *ac_out_var = pattern->NewNode(ac_out_repr()) + ->AsOutput() + ->assert_is_op_output("affine_channel"); + + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + + if (with_eltwise_add) { + eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var}) + .LinksTo({eltwise_out_var}); + affine_channel_op->LinksFrom({eltwise_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } else { + affine_channel_op->LinksFrom({conv_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } + return ac_out_var; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index eaedd9d08e..61a5300344 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -734,6 +734,38 @@ struct ConvElementwiseadd : public PatternBase { PATTERN_DECL_NODE(elementwise_add_out); }; +// Conv with affine_channel +// op: conv + (elementwise_add +) affine_channel +// named nodes: +// conv_weight, conv_out, conv, +// ac_x, ac_scale, ac_bias +// affine_channel, ac_out +struct ConvAffineChannel : public PatternBase { + ConvAffineChannel(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_affine_channel") {} + + PDNode* operator()(PDNode* conv_input, bool with_eltwise_add); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(affine_channel); + PATTERN_DECL_NODE(eltwise); // ELEMENTWISE_ADD + // CONV inputs + PATTERN_DECL_NODE(conv_weight); // Filter + // CONV outputs + PATTERN_DECL_NODE(conv_out); // tmp + // ELTWISE inputs + PATTERN_DECL_NODE(eltwise_y_in); + // ELTWISE outputs + PATTERN_DECL_NODE(eltwise_out); // tmp + + // AC(Affine_Channel) inputs + PATTERN_DECL_NODE(ac_scale); + PATTERN_DECL_NODE(ac_bias); + // AC outputs + PATTERN_DECL_NODE(ac_out); // Out +}; + } // namespace patterns // Link two ir::Nodes from each other. 
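The recompute_bias_and_weights routine above folds an affine_channel transform (per output channel: y = Scale * x + Bias) into the preceding conv2d's filter and bias, so the affine_channel node can be removed from the graph. A minimal standalone sketch of that arithmetic with plain loops; the function name and the flat [oc][ic*kh*kw] weight layout are illustrative, not taken from the patch:

#include <cstddef>
#include <vector>

// Fold y = scale[c] * (conv_out[c] + conv_bias[c]) + ac_bias[c] into the
// conv parameters themselves.
void FoldAffineChannelIntoConv(std::vector<float>* conv_weight,  // [oc][ic*kh*kw]
                               std::vector<float>* conv_bias,    // [oc]
                               const std::vector<float>& ac_scale,
                               const std::vector<float>& ac_bias) {
  const size_t oc = conv_bias->size();
  const size_t inner = conv_weight->size() / oc;  // ic * kh * kw per channel
  for (size_t c = 0; c < oc; ++c) {
    // Same as: eltwise_y_in_array = eltwise_y_in_array * scale_array + ac_bias_array
    (*conv_bias)[c] = (*conv_bias)[c] * ac_scale[c] + ac_bias[c];
    // Same effect as the Eigen expression that scales each output channel's filter
    for (size_t k = 0; k < inner; ++k) {
      (*conv_weight)[c * inner + k] *= ac_scale[c];
    }
  }
}

In the pure conv + affine_channel case the pass first creates a zero-initialized eltwise_y_in tensor as the conv bias, so the bias formula above reduces to ac_bias; with an existing elementwise_add bias it becomes scale * bias + ac_bias, matching the handler code.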
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index bd5b76426e..9e77f98e9e 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -75,6 +75,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( std::vector optimize_ops; std::vector lr_ops; // ops other than forward/backward/optimize std::unordered_set grad_names; + std::unordered_map gradname2paramname; std::vector nodes = TopologySortOperations(*graph); auto origin_nodes = graph->ReleaseNodes(); @@ -99,6 +100,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( auto op_role_vars = boost::get>(op_role_var); for (size_t i = 0; i < op_role_vars.size(); i += 2) { grad_names.insert(op_role_vars[i + 1]); + gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i]; } } else if (op_role & static_cast(framework::OpRole::kLRSched)) { lr_ops.push_back(node); @@ -109,7 +111,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( // 2. copy forward backward ir::Node* prev_repeat_last_op_node = nullptr; - // record origin_grad -> repeated grad list map. + // record origin_grad -> repeated_grad_list map. std::map> grad_repeated_map; std::map> created; std::unordered_set bn_vars_need_rename; @@ -124,10 +126,16 @@ std::unique_ptr BatchMergePass::ApplyImpl( if (grad_names.find(outname) != grad_names.end()) { std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i); repeated_op.RenameOutput(outname, new_gname); + // remove op_role_var for backward ops that outputs grad for a + // parameter. + repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(), + std::vector()); } } // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do - // not need this update + // not need this update, because only moving mean and variance should be + // differ, trainable parameter scale and bias is the same as other + // parameters. if (node->Name() == "batch_norm") { // NOTE: assume bn op created by layers use save var as output mean and // variance @@ -224,16 +232,25 @@ std::unique_ptr BatchMergePass::ApplyImpl( var->inputs.push_back(repeated_node); } } - } + } // end copy forward backward - // 5. create GRAD merge op node + // 5. create GRAD merge op node: sum(repeat.0...repeat.n) -> + // scale(1/num_repeats) for (auto kv : grad_repeated_map) { OpDesc sum_op; sum_op.SetType("sum"); std::vector repeated_grad_names; + std::vector param_grad_op_role_var; for (auto r : kv.second) { repeated_grad_names.push_back(r->Var()->Name()); } + // NOTE: use op_role_var to control allreduce op appending in + // multi_devices_graph_pass, we want to append op_role_var + // only once for the merged gradient, so break after first call. 
+ param_grad_op_role_var.push_back( + gradname2paramname.at(kv.first->Var()->Name())); // param + param_grad_op_role_var.push_back(kv.first->Var()->Name()); // grad + sum_op.SetInput("X", repeated_grad_names); sum_op.SetOutput("Out", {kv.first->Var()->Name()}); sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), @@ -256,6 +273,10 @@ std::unique_ptr BatchMergePass::ApplyImpl( scale_op.SetAttr("scale", static_cast(1.0f / num_repeats)); scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), static_cast(OpRole::kBackward)); + + scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(), + param_grad_op_role_var); + auto scale_op_node = result.CreateOpNode(&scale_op); scale_op_node->inputs.push_back(sum_out_var_node); sum_out_var_node->outputs.push_back(scale_op_node); diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 6940250c3f..c3a044d22c 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -215,8 +215,8 @@ class Vector { auto stream = dev_ctx->stream(); void *src = gpu_->ptr(); void *dst = cpu_.data(); - memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_->size(), stream); + paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -261,8 +261,8 @@ class Vector { auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_->size(), stream); + paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -284,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable memory::AllocationPtr gpu_; + mutable paddle::memory::AllocationPtr gpu_; mutable int flag_; mutable std::mutex mtx_; diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index f1642bc0d2..86e6b1f7d9 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -40,14 +40,14 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, void NaiveExecutor::Run() { #ifndef PADDLE_ON_INFERENCE - LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the " - "cmake flag ON_INFER is not set."; - LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and " - "variables will be reused to save the allocation " - "overhead."; - LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by " - "setting the cmake flag ON_INFER=ON if you are " - "running Paddle Inference"; + LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the " + "cmake flag ON_INFER is not set."; + LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and " + "variables will be reused to save the allocation " + "overhead."; + LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by " + "setting the cmake flag ON_INFER=ON if you are " + "running Paddle Inference"; #endif // PADDLE_ON_INFERENCE for (auto &op : ops_) { VLOG(3) << std::this_thread::get_id() << " run " << op->Type() diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index 42190b5228..b083493ba4 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -32,8 +32,11 @@ std::map>>)>> NgraphBridge::NG_NODE_MAP 
= { {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, + {"mean", paddle::operators::ngraphs::BuildMeanNode}, + {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode}, {"mul", paddle::operators::ngraphs::BuildMulNode}, {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, + {"scale", paddle::operators::ngraphs::BuildScaleNode}, {"relu", paddle::operators::ngraphs::BuildUnaryNode}, {"tanh", paddle::operators::ngraphs::BuildUnaryNode}, {"top_k", paddle::operators::ngraphs::BuildTopKNode}}; diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 23f681ce88..7e174c7def 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -399,7 +399,7 @@ void NgraphEngine::BuildNgFunction() { BuildNgNodes(); ngraph_function_ = nullptr; ngraph::NodeVector func_outputs; - ngraph::op::ParameterVector func_inputs; + ngraph::ParameterVector func_inputs; for (auto& vo : var_out_) { func_outputs.push_back(var_node_map_->at(vo)); @@ -539,7 +539,7 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { } } - backend_->call(ngraph_function_, t_out, t_in); + backend_->call(backend_->compile(ngraph_function_), t_out, t_in); } // NgraphEngine::RunImpl } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 2311614c33..ca31303f77 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -82,10 +82,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, AddAttr(OpNamescopeAttrName(), "Operator name with namesope.") .SetDefault(""); - AddAttr>(OpCreationCallstackAttrName(), - "Callstack for Op Creatation.") - .SetDefault({}); - Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 0a0f8f4655..4c59c73d87 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -47,7 +47,6 @@ class OpProtoAndCheckerMaker { static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpNamescopeAttrName() { return "op_namescope"; } - static const char *OpCreationCallstackAttrName() { return "op_callstack"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 6d39bb3c52..2c1648c81f 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,7 +23,8 @@ limitations under the License. */ #include #include -#include "glog/logging.h" // For VLOG() +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ac2828136b..f10da22aec 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -16,15 +16,9 @@ limitations under the License. 
*/ #include #include -#include -#include -#include -#include "gflags/gflags.h" -#include "glog/logging.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/transfer_scope_cache.h" @@ -162,67 +156,31 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - try { - if (VLOG_IS_ON(4)) { - VLOG(4) << place << " " << DebugStringEx(&scope); - } - if (platform::is_gpu_place(place)) { + VLOG(4) << place << " " << DebugStringEx(&scope); + if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW("Cannot run operator on place %s", place); #else - auto dev_id = boost::get(place).device; - platform::SetDeviceId(dev_id); + auto dev_id = boost::get(place).device; + platform::SetDeviceId(dev_id); #endif - } - - // The profile has a process-wide mutex, results in serious performance - // issue - // in concurrency scenerio. Here use an `if` to fix this issue. - // Please not remove the `if`, ask @Superjomn if there are any concern. - if (platform::IsProfileEnabled()) { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); - } else { - RunImpl(scope, place); - } - - if (VLOG_IS_ON(3)) { - VLOG(3) << place << " " << DebugStringEx(&scope); - } - } catch (platform::EnforceNotMet exception) { - if (Attrs().count("sub_block") != 0) { - throw exception; - } - - auto& callstack = Attr>( - OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + } - if (callstack.empty()) { - throw exception; - } - std::ostringstream sout; - sout << "Invoke operator " << Type() << " error.\n"; - sout << "Python Callstacks: \n"; - for (auto& line : callstack) { - sout << line; - } - sout << "C++ Callstacks: \n"; - sout << exception.err_str_; - exception.err_str_ = sout.str(); - throw exception; - } catch (...) { - std::rethrow_exception(std::current_exception()); + // The profile has a process-wide mutex, results in serious performance issue + // in concurrency scenerio. Here use an `if` to fix this issue. + // Please not remove the `if`, ask @Superjomn if there are any concern. 
+ if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + } else { + RunImpl(scope, place); } + VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { - if (inputs_.find(name) != inputs_.end()) { - return true; - } else { - return false; - } + return inputs_.find(name) != inputs_.end(); } std::string OperatorBase::Input(const std::string& name) const { @@ -421,7 +379,7 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { return &(var.Get().value()); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var.Type().name()); + ToTypeName(var.Type())); } } @@ -432,7 +390,7 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { return var->GetMutable()->mutable_value(); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -526,7 +484,7 @@ const std::vector ExecutionContext::MultiInput( PADDLE_ENFORCE( var->IsType(), "should be LoDTensor, but the received type is %s", - var->Type().name()); + ToTypeName(var->Type())); return &(var->Get()); }); return res; @@ -545,7 +503,7 @@ const std::vector ExecutionContext::LegacyMultiInput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return &(var->Get()); }); return res; @@ -574,7 +532,7 @@ std::vector ExecutionContext::MultiOutput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return var->GetMutable(); }); return res; @@ -816,7 +774,7 @@ class RuntimeInferShapeContext : public InferShapeContext { PADDLE_THROW( "Only LoDTensor/SelectedRows support 'GetDim', but Variables " "type_id is %s.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -839,7 +797,7 @@ class RuntimeInferShapeContext : public InferShapeContext { var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -1082,12 +1040,11 @@ Scope* OperatorWithKernel::PrepareData( proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { - auto& scope = ctx.scope(); int data_type = -1; - std::string last_input_name; for (auto& input : this->inputs_) { - for (auto& ipt_name : input.second) { - auto* var = scope.FindVar(ipt_name); + const std::vector vars = ctx.MultiInputVar(input.first); + for (size_t i = 0; i < vars.size(); ++i) { + const Variable* var = vars[i]; if (var != nullptr) { const Tensor* t = nullptr; if (var->IsType()) { @@ -1098,15 +1055,14 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { - PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized", - ipt_name); + PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized", + input.first, i); int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", - Type(), last_input_name, data_type, ipt_name, tmp); + "DataType of Paddle Op %s must be the same. 
Get (%d) != (%d)", + Type(), data_type, tmp); data_type = tmp; - last_input_name = ipt_name; } } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1fe2daacf1..4d29564aee 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@"; /// e.g. Variable "x@GRAD" is the gradient of varibale "x". constexpr char kGradVarSuffix[] = "@GRAD"; +constexpr size_t kGradVarSuffixSize = 5U; + /// Variables with this suffix are supposed to be filled up with zeros. constexpr char kZeroVarSuffix[] = "@ZERO"; @@ -60,7 +62,20 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; extern std::vector> kKernelPriority; inline std::string GradVarName(const std::string& var_name) { - return var_name + kGradVarSuffix; + std::string result; + result.reserve(var_name.size() + kGradVarSuffixSize); + result += var_name; + result += kGradVarSuffix; + return result; +} + +inline std::string GradOriginalVarName(const std::string& grad_var_name) { + std::size_t pos = grad_var_name.rfind(kGradVarSuffix); + if (pos == std::string::npos) { + return grad_var_name; + } else { + return grad_var_name.substr(0, pos); + } } proto::VarType::Type GetDataTypeOfVar(const Variable* var); @@ -75,6 +90,10 @@ class RuntimeContext { RuntimeContext(const VariableNameMap& innames, const VariableNameMap& outnames, const Scope& scope); + RuntimeContext(const VariableValueMap& invars, + const VariableValueMap& outvars) + : inputs(invars), outputs(outvars) {} + VariableValueMap inputs; VariableValueMap outputs; }; @@ -110,8 +129,8 @@ class OperatorBase { bool HasAttr(const std::string& name) const { return attrs_.count(name); } template inline const T& Attr(const std::string& name) const { - PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", - name); + PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(), + "%s should be in AttributeMap", name); return boost::get(attrs_.at(name)); } const AttributeMap& Attrs() const { return attrs_; } @@ -358,6 +377,30 @@ class ExecutionContext { return op_.Outputs(name); } + template + Tensor AllocateTmpTensor(const framework::DDim& dim, + const DevContext& dev_ctx) const { + auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance() + .Get(dev_ctx) + .Allocate(product(dim) * sizeof(T)); + auto& deleter = tmp_allocation_ptr.get_deleter(); + auto* allocation_ptr = tmp_allocation_ptr.release(); + auto shared_allocation = std::shared_ptr( + allocation_ptr, deleter); + + PADDLE_ENFORCE( + dynamic_cast(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); + PADDLE_ENFORCE_EQ(allocation_ptr->size(), + framework::product(dim) * sizeof(T)); + + paddle::framework::Tensor temp_tensor( + framework::ToDataType(std::type_index(typeid(T)))); + temp_tensor.Resize(dim); + temp_tensor.ResetHolder(std::move(shared_allocation)); + return temp_tensor; + } + private: const OperatorBase& op_; const Scope& scope_; @@ -441,8 +484,9 @@ class OperatorWithKernel : public OperatorBase { void RuntimeInferShape(const Scope& scope, const platform::Place& place, const RuntimeContext& ctx) const override; - protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + + protected: virtual OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const OpKernelType& expected_kernel_type) const; diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index 
ab14732e4d..fe4804ac25 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -288,3 +288,30 @@ TEST(OpKernel, multi_inputs) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); op->Run(scope, cpu_place); } + +TEST(VarNameTest, all) { + std::string var_name("X"); + std::string grad_var_name = paddle::framework::GradVarName(var_name); + ASSERT_EQ(grad_var_name, "X@GRAD"); + std::string original_var_name = + paddle::framework::GradOriginalVarName(grad_var_name); + ASSERT_EQ(original_var_name, "X"); + original_var_name = paddle::framework::GradOriginalVarName(original_var_name); + ASSERT_EQ(original_var_name, "X"); + + std::string var_name_2("XYZ"); + grad_var_name = paddle::framework::GradVarName(var_name_2); + ASSERT_EQ(grad_var_name, "XYZ@GRAD"); + original_var_name = paddle::framework::GradOriginalVarName(grad_var_name); + ASSERT_EQ(original_var_name, "XYZ"); + original_var_name = paddle::framework::GradOriginalVarName(original_var_name); + ASSERT_EQ(original_var_name, "XYZ"); + + std::string var_name_3(""); + grad_var_name = paddle::framework::GradVarName(var_name_3); + ASSERT_EQ(grad_var_name, "@GRAD"); + original_var_name = paddle::framework::GradOriginalVarName(grad_var_name); + ASSERT_EQ(original_var_name, ""); + original_var_name = paddle::framework::GradOriginalVarName(original_var_name); + ASSERT_EQ(original_var_name, ""); +} diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a921f469f5..450fe1508f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -21,12 +21,9 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -#include "paddle/fluid/platform/nccl_helper.h" -#endif - #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" @@ -38,6 +35,8 @@ limitations under the License. */ DEFINE_string(pe_profile_fname, "", "Profiler filename for PE, which generated by gperftools." "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); +DEFINE_bool(enable_parallel_graph, false, + "Force disable parallel graph execution mode if set false."); namespace paddle { namespace framework { @@ -106,6 +105,7 @@ class ParallelExecutorPrivate { bool own_local_scope_; bool use_cuda_; bool use_all_reduce_; + size_t nranks_; // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and // then keeps unchanged @@ -201,6 +201,7 @@ ParallelExecutor::ParallelExecutor( member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; + member_->nranks_ = num_trainers * places.size(); if (!member_->use_all_reduce_) { PADDLE_ENFORCE(places.size() > 1, @@ -224,62 +225,98 @@ ParallelExecutor::ParallelExecutor( } } + // FIXME(Yancey1989): parallel graph mode get better performance + // in GPU allreduce distributed training. Need an elegant way to + // choice the execution strategy. 
+ build_strategy.enable_parallel_graph_ = + EnableParallelGraphExecution(main_program, exec_strategy, build_strategy); + + VLOG(1) << "Enable ParallelGraph Execution: " + << build_strategy.enable_parallel_graph_; + if (member_->use_cuda_) { // Bcast Parameters to all GPUs #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; + // gen_nccl_id operator can broadcast the ncclUniqueId for nccl2 collective + // distributed training + auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); if (nccl_id_var != nullptr) { nccl_id = nccl_id_var->GetMutable(); } + if (build_strategy.enable_parallel_graph_ && member_->nranks_ > 1UL) { + if (nccl_id == nullptr) { + local_nccl_id_.reset(new ncclUniqueId()); + platform::dynload::ncclGetUniqueId(local_nccl_id_.get()); + nccl_id = local_nccl_id_.get(); + } + } + member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->places_, nccl_id, num_trainers, trainer_id)); #else PADDLE_THROW("Not compiled with CUDA"); #endif } - if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToDevices(bcast_vars); } -// Startup Program has been run. All local scopes has correct parameters. + // Startup Program has been run. All local scopes has correct parameters. -// Step 2. Convert main_program to SSA form and dependency graph. Also, insert -// ncclOp + // Step 2. Convert main_program to SSA form and dependency graph. Also, insert + // ncclOp + std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + if (build_strategy.enable_parallel_graph_) { + for (size_t i = 0; i < member_->places_.size(); ++i) { + std::unique_ptr graph = build_strategy.Apply( + main_program, {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); + } + } else { + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); + } +#else std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->use_cuda_, member_->nccl_ctxs_.get()); -#else - std::unique_ptr graph = - build_strategy.Apply(main_program, member_->places_, loss_var_name, - member_->local_scopes_, member_->use_cuda_); + member_->nranks_, member_->use_cuda_); + graphs.push_back(std::move(graph)); #endif auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - graph = member_->PrepareGCAndRefCnts(std::move(graph), - static_cast(max_memory_size)); + for (size_t i = 0; i < graphs.size(); ++i) { + graphs[i] = member_->PrepareGCAndRefCnts( + std::move(graphs[i]), static_cast(max_memory_size)); + } } // Step 3. Create vars in each scope. Passes may also create new vars. 
// skip control vars and empty vars std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); + for (auto &graph : graphs) { + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } } } + // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graph); + size_t graph_num = ir::GraphNum(*graphs[0]); if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " - << ir::GraphNum(*graph) + << ir::GraphNum(*graphs[0]) << " sub_graphs. If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -287,14 +324,20 @@ ParallelExecutor::ParallelExecutor( } } - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + if (build_strategy.enable_parallel_graph_) { + member_->executor_.reset(new details::ParallelSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + std::move(graphs))); } else { - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); + } else { + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); + } } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -320,6 +363,7 @@ void ParallelExecutor::BCastParamsToDevices( if (paddle::platform::is_gpu_place(main_tensor.place())) { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::vector buffers; + buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); for (size_t i = 0; i < member_->places_.size(); ++i) { @@ -353,9 +397,7 @@ void ParallelExecutor::BCastParamsToDevices( #endif } else { platform::CPUPlace cpu; - for (size_t i = 0; i < member_->places_.size(); ++i) { - if (i == 0) continue; - + for (size_t i = 1; i < member_->places_.size(); ++i) { auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); @@ -424,6 +466,36 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } } +bool ParallelExecutor::EnableParallelGraphExecution( + const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy) const { + if (!FLAGS_enable_parallel_graph) return false; + + bool enable_parallel_graph = true; + // TODO(Yancey1989): support sparse update in ParallelGraph mode. 
+ for (auto &var_desc : main_program.Block(0).AllVars()) { + if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) { + enable_parallel_graph = false; + } + } + + // TODO(Yancey1989): support pserver mode + for (auto &op_desc : main_program.Block(0).AllOps()) { + if (op_desc->Type() == "send" || op_desc->Type() == "recv") { + enable_parallel_graph = false; + break; + } + } + + if (!member_->use_all_reduce_ || !member_->use_cuda_) + enable_parallel_graph = false; + + if (build_strategy.enable_sequential_execution_ || + exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) + enable_parallel_graph = false; + return enable_parallel_graph; +} + ParallelExecutor::~ParallelExecutor() { for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 5f6c2159aa..49d3f0d3f6 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -28,6 +28,10 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + namespace paddle { namespace framework { @@ -68,8 +72,14 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; + bool EnableParallelGraphExecution(const ProgramDesc &main_program, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy) const; ParallelExecutorPrivate *member_; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + std::unique_ptr local_nccl_id_; +#endif }; } // namespace framework diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index dbf00f3a79..f8aa87519a 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -16,7 +16,9 @@ limitations under the License. */ #if !defined(_WIN32) #include -#endif // !_WIN32 +#else +#include // NOLINT +#endif // !_WIN32 #include "paddle/fluid/platform/enforce.h" @@ -29,17 +31,17 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } - void RDLock() { + inline void RDLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed"); } - void WRLock() { + inline void WRLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed"); } - void UNLock() { + inline void UNLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); } @@ -51,81 +53,46 @@ struct RWLock { // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive // In windows, rw_lock seems like a hack. Use empty object and do nothing. 
struct RWLock { - void RDLock() {} - void WRLock() {} - void UNLock() {} + // FIXME(minqiyang): use mutex here to do fake lock + inline void RDLock() { mutex_.lock(); } + + inline void WRLock() { mutex_.lock(); } + + inline void UNLock() { mutex_.unlock(); } + + private: + std::mutex mutex_; }; #endif -class RWLockGuard { +class AutoWRLock { public: - enum Status { kUnLock, kWRLock, kRDLock }; - - RWLockGuard(RWLock* rw_lock, Status init_status) - : lock_(rw_lock), status_(Status::kUnLock) { - switch (init_status) { - case Status::kRDLock: { - RDLock(); - break; - } - case Status::kWRLock: { - WRLock(); - break; - } - case Status::kUnLock: { - break; - } - } - } + explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - void WRLock() { - switch (status_) { - case Status::kUnLock: { - lock_->WRLock(); - status_ = Status::kWRLock; - break; - } - case Status::kWRLock: { - break; - } - case Status::kRDLock: { - PADDLE_THROW( - "Please unlock read lock first before invoking write lock."); - break; - } - } - } + ~AutoWRLock() { UnLock(); } - void RDLock() { - switch (status_) { - case Status::kUnLock: { - lock_->RDLock(); - status_ = Status::kRDLock; - break; - } - case Status::kRDLock: { - break; - } - case Status::kWRLock: { - PADDLE_THROW( - "Please unlock write lock first before invoking read lock."); - break; - } - } - } + private: + inline void Lock() { lock_->WRLock(); } - void UnLock() { - if (status_ != Status::kUnLock) { - lock_->UNLock(); - status_ = Status::kUnLock; - } - } + inline void UnLock() { lock_->UNLock(); } + + private: + RWLock* lock_; +}; + +class AutoRDLock { + public: + explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + + ~AutoRDLock() { UnLock(); } + + private: + inline void Lock() { lock_->RDLock(); } - ~RWLockGuard() { UnLock(); } + inline void UnLock() { lock_->UNLock(); } private: RWLock* lock_; - Status status_; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 6fa5e99f9f..a5742dbd3d 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false, // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. 
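(Aside on the rw_lock.h hunk above: the patch replaces the stateful RWLockGuard with AutoRDLock/AutoWRLock, plain RAII guards that lock in the constructor and unlock in the destructor. Below is a minimal standalone sketch of the same pattern using std::shared_mutex (C++17) instead of Paddle's pthread-based RWLock; the names ReadGuard/WriteGuard are illustrative only, not Paddle's API.)

#include <shared_mutex>
#include <vector>

// Lock in the constructor, unlock in the destructor; no UnLock()/status juggling.
class ReadGuard {
 public:
  explicit ReadGuard(std::shared_mutex* m) : m_(m) { m_->lock_shared(); }
  ~ReadGuard() { m_->unlock_shared(); }
 private:
  std::shared_mutex* m_;
};

class WriteGuard {
 public:
  explicit WriteGuard(std::shared_mutex* m) : m_(m) { m_->lock(); }
  ~WriteGuard() { m_->unlock(); }
 private:
  std::shared_mutex* m_;
};

// Usage: many readers may hold ReadGuard concurrently; WriteGuard is exclusive.
std::shared_mutex mu;
std::vector<int> values;

int ReadFirst() {
  ReadGuard guard(&mu);   // shared lock for lookups
  return values.empty() ? -1 : values.front();
}

void Append(int v) {
  WriteGuard guard(&mu);  // exclusive lock for mutation
  values.push_back(v);
}

(std::shared_lock/std::unique_lock would do the same job; the explicit guards simply mirror the shape of AutoRDLock/AutoWRLock in the patch.)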
#ifdef PADDLE_ON_INFERENCE -#define SCOPE_LOCK_GUARD +#define SCOPE_KIDS_READER_LOCK +#define SCOPE_KIDS_WRITER_LOCK +#define SCOPE_VARS_READER_LOCK +#define SCOPE_VARS_WRITER_LOCK #else -#define SCOPE_LOCK_GUARD std::lock_guard lock(mutex_); +#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); #endif namespace paddle { @@ -67,64 +73,69 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - SCOPE_LOCK_GUARD - kids_.push_back(new Scope(this)); - return *kids_.back(); + Scope* child = new Scope(this); + { + SCOPE_KIDS_WRITER_LOCK + kids_.push_back(child); + } + return *child; } Variable* Scope::Var(const std::string& name) { - SCOPE_LOCK_GUARD + SCOPE_VARS_WRITER_LOCK return VarInternal(name); } Variable* Scope::Var(std::string* name) { - SCOPE_LOCK_GUARD auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; } + SCOPE_VARS_WRITER_LOCK return VarInternal(new_name); } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_READER_LOCK return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_READER_LOCK return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_READER_LOCK return FindScopeInternal(var); } void Scope::DropKids() { - SCOPE_LOCK_GUARD + SCOPE_KIDS_WRITER_LOCK for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_LOCK_GUARD + SCOPE_KIDS_READER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - SCOPE_LOCK_GUARD std::vector known_vars; - known_vars.reserve(this->vars_.size()); - for (auto& p : vars_) { - known_vars.emplace_back(p.first); + { + SCOPE_VARS_READER_LOCK + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); + } } return known_vars; } void Scope::DeleteScope(Scope* scope) const { - SCOPE_LOCK_GUARD + SCOPE_KIDS_WRITER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -138,8 +149,8 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - SCOPE_LOCK_GUARD std::set var_set(var_names.begin(), var_names.end()); + SCOPE_VARS_WRITER_LOCK for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { it = vars_.erase(it); @@ -151,12 +162,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_WRITER_LOCK RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; @@ -165,11 +176,9 @@ std::string Scope::Rename(const std::string& origin_name) const { Variable* Scope::VarInternal(const std::string& name) { auto* v = FindVarLocally(name); if (v 
!= nullptr) return v; - v = new Variable(); - vars_[name].reset(v); + vars_.emplace(name, std::unique_ptr(v)); VLOG(3) << "Create variable " << name; - v->name_ = &(vars_.find(name)->first); return v; } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index aded1f771c..f0915d2eee 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,12 +14,18 @@ limitations under the License. */ #pragma once +extern "C" { +#include +} + #include -#include // NOLINT +#include #include #include +#include #include +#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -95,7 +101,14 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable std::unordered_map> vars_; + struct KeyHasher { + std::size_t operator()(const std::string& key) const { + return XXH32(key.c_str(), key.size(), 1); + } + }; + + mutable std::unordered_map, KeyHasher> + vars_; private: // Call Scope::NewScope for a sub-scope. @@ -124,7 +137,8 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); private: - mutable std::mutex mutex_; + mutable RWLock kids_lock_; + mutable RWLock vars_lock_; }; // Generate some debug string about the inherience structure of scope, quite diff --git a/paddle/fluid/framework/scope_pool.cc b/paddle/fluid/framework/scope_pool.cc new file mode 100644 index 0000000000..5cb241a7a3 --- /dev/null +++ b/paddle/fluid/framework/scope_pool.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
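(Aside on the scope.h hunk above: the vars_ map gains a custom KeyHasher based on XXH32 instead of the default std::hash<std::string>. A rough standalone sketch of how such a hasher plugs into std::unordered_map follows; it assumes the xxhash header is on the include path, and the seed value 1 simply mirrors the diff. Everything else is illustrative, not Paddle's API.)

#include <string>
#include <unordered_map>
#include <xxhash.h>  // assumed available, as in the patch

struct XXH32Hasher {
  std::size_t operator()(const std::string& key) const {
    // Same call shape as KeyHasher in scope.h: data pointer, length, seed.
    return XXH32(key.c_str(), key.size(), 1);
  }
};

// The hasher is supplied as the third template argument of std::unordered_map.
using VarNameMap = std::unordered_map<std::string, int, XXH32Hasher>;

int Lookup(const VarNameMap& vars, const std::string& name) {
  auto it = vars.find(name);
  return it == vars.end() ? -1 : it->second;
}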
+ +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/threadpool.h" + +namespace paddle { +namespace framework { + +ScopePool &ScopePool::Instance() { // NOLINT + static ScopePool pool; + return pool; +} + +void ScopePool::DeleteScope(Scope *scope) { delete scope; } + +void ScopePool::Insert(std::unique_ptr &&s) { + std::lock_guard guard(mtx_); + scopes_.insert(s.release()); +} + +void ScopePool::Remove(Scope *s) { + size_t has_scope; + { + std::lock_guard guard(mtx_); + has_scope = scopes_.erase(s); + } + PADDLE_ENFORCE(has_scope > 0, "Delete non-existing global scope"); + DeleteScope(s); +} + +ScopePool::~ScopePool() { Clear(); } + +void ScopePool::Clear() { + std::lock_guard guard(mtx_); + for (auto *s : scopes_) { + DeleteScope(s); + } + scopes_.clear(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.h b/paddle/fluid/framework/scope_pool.h similarity index 64% rename from paddle/fluid/framework/details/multi_devices_graph_check_pass.h rename to paddle/fluid/framework/scope_pool.h index 1e2b1867c3..a8b468699a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.h +++ b/paddle/fluid/framework/scope_pool.h @@ -14,25 +14,33 @@ #pragma once -#include "paddle/fluid/framework/details/multi_devices_helper.h" - -#include +#include // NOLINT +#include +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace framework { -namespace details { -class SSAGraghBuilderWithChecker : public ir::Pass { - protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override { - PADDLE_ENFORCE(IsValidGraph(graph.get())); - return graph; - } +class ScopePool { + public: + static ScopePool &Instance(); // NOLINT + + void Insert(std::unique_ptr &&s); + + void Remove(Scope *s); + + void Clear(); + + ~ScopePool(); + + private: + ScopePool() = default; + + static void DeleteScope(Scope *scope); - bool IsValidGraph(const ir::Graph* graph) const; + std::unordered_set scopes_; + std::mutex mtx_; }; -} // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 871c7bd2a7..1ffd357e62 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -151,27 +151,5 @@ void TensorToVector(const Tensor& src, std::vector* dst) { memory::Copy(dst_place, dst_ptr, boost::get(src.place()), src_ptr, size); } - -template -paddle::framework::Tensor GetTensor( - memory::allocation::AllocationPtr temp_allocation_ptr, - const framework::DDim& dim) { - auto& deleter = temp_allocation_ptr.get_deleter(); - auto* allocation_ptr = temp_allocation_ptr.release(); - auto shared_allocation = - std::shared_ptr(allocation_ptr, deleter); - - PADDLE_ENFORCE( - dynamic_cast(allocation_ptr) != nullptr, - "The AllocationPtr must be TemporaryAllocation."); - PADDLE_ENFORCE_EQ(allocation_ptr->size(), - framework::product(dim) * sizeof(T)); - - paddle::framework::Tensor temp_tensor( - framework::ToDataType(std::type_index(typeid(T)))); - temp_tensor.Resize(dim); - temp_tensor.ResetHolder(std::move(shared_allocation)); - return temp_tensor; -} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index fcec955360..d34f826c1a 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -89,7 +89,6 @@ void ThreadPool::TaskLoop() { task = 
std::move(tasks_.front()); tasks_.pop(); } - // run the task task(); } diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/fluid/framework/unroll_array_ops.h new file mode 100644 index 0000000000..731da74eff --- /dev/null +++ b/paddle/fluid/framework/unroll_array_ops.h @@ -0,0 +1,179 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace framework { + +namespace detail { + +template +struct UnrollFillConstant { + template + HOSTDEVICE inline static void Run(T *data, T val) { + data[kStart] = val; + UnrollFillConstant::Run(data, val); + } +}; + +template +struct UnrollFillConstant { + template + HOSTDEVICE inline static void Run(T *data, T val) {} +}; + +template +struct UnrollAssign { + template + HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) { + d2[kStart] = static_cast(d1[kStart]); + UnrollAssign::Run(d1, d2); + } +}; + +template +struct UnrollAssign { + template + HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {} +}; + +template +struct UnrollVarArgsAssignImpl { + template + HOSTDEVICE inline static void Run(T *d, T val, Args... args) { + static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument"); + d[kStart] = val; + UnrollVarArgsAssignImpl::Run( + d, args...); + } +}; + +template +struct UnrollVarArgsAssignImpl { + HOSTDEVICE inline static void Run(T *d) {} +}; + +template +struct UnrollVarArgsAssign { + template + HOSTDEVICE inline static void Run(T *d, Args... 
args) { + UnrollVarArgsAssignImpl::Run( + d, args...); + } +}; + +template +struct UnrollCompare { + template + HOSTDEVICE inline static bool Run(const T *d1, const T *d2) { + return d1[kStart] == d2[kStart] && + UnrollCompare::Run(d1, d2); + } +}; + +template +struct UnrollCompare { + template + HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) { + return true; + } +}; + +template +struct UnrollAdd { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) { + d3[kStart] = d1[kStart] + d2[kStart]; + UnrollAdd::Run(d1, d2, d3); + } +}; + +template +struct UnrollAdd { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {} +}; + +template +struct UnrollMul { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) { + d3[kStart] = d1[kStart] * d2[kStart]; + UnrollMul::Run(d1, d2, d3); + } +}; + +template +struct UnrollMul { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {} +}; + +template +struct UnrollProduct { + template + HOSTDEVICE inline static T Run(const T *d) { + return d[kStart] * + UnrollProduct::Run(d); + } + + template + HOSTDEVICE inline static T Run(const T *d1, const T *d2) { + return d1[kStart] * d2[kStart] + + UnrollProduct::Run(d1, d2); + } +}; + +template +struct UnrollProduct { + template + HOSTDEVICE inline constexpr static T Run(const T *d) { + return 1; + } + + template + HOSTDEVICE inline constexpr static T Run(const T *d1, const T *d2) { + return 0; + } +}; + +} // namespace detail + +template +using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>; + +template +using UnrollAssign = detail::UnrollAssign<0, N, N == 0>; + +template +using UnrollVarArgsAssign = detail::UnrollVarArgsAssign; + +template +using UnrollCompare = detail::UnrollCompare<0, N, N == 0>; + +template +using UnrollAdd = detail::UnrollAdd<0, N, N == 0>; + +template +using UnrollMul = detail::UnrollMul<0, N, N == 0>; + +template +using UnrollProduct = detail::UnrollProduct<0, N, N == 0>; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/unroll_array_ops_test.cc b/paddle/fluid/framework/unroll_array_ops_test.cc new file mode 100644 index 0000000000..51433c83c8 --- /dev/null +++ b/paddle/fluid/framework/unroll_array_ops_test.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
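(Aside on unroll_array_ops.h above: the template parameter lists were mangled when this diff was flattened, so here is a compact standalone sketch of the recursion pattern it uses: a (kStart, kEnd, kStop) triple where the partial specialization with kStop == true terminates the compile-time loop. The HOSTDEVICE qualifier and the other ops (assign, compare, add, mul, product) follow the same shape; the names below are illustrative only.)

#include <cstddef>
#include <iostream>

// Recursive case: handle index kStart, then recurse with kStart + 1.
template <std::size_t kStart, std::size_t kEnd, bool kStop>
struct UnrollFill {
  template <typename T>
  static void Run(T* data, T val) {
    data[kStart] = val;
    UnrollFill<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(data, val);
  }
};

// Terminating specialization: kStop == true, do nothing.
template <std::size_t kStart, std::size_t kEnd>
struct UnrollFill<kStart, kEnd, true> {
  template <typename T>
  static void Run(T*, T) {}
};

// Public alias, mirroring UnrollFillConstant<N> = detail::...<0, N, N == 0>.
template <std::size_t N>
using Fill = UnrollFill<0, N, N == 0>;

int main() {
  int buf[4] = {0, 0, 0, 0};
  Fill<3>::Run(buf, 7);  // effectively unrolls to buf[0] = buf[1] = buf[2] = 7
  std::cout << buf[0] << buf[1] << buf[2] << buf[3] << "\n";  // prints 7770
  return 0;
}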
+ +#include "paddle/fluid/framework/unroll_array_ops.h" +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +template +bool CheckEquality(const T* p, size_t n, T val) { + return std::all_of(p, p + n, [val](const T& v) { return v == val; }); +} + +template +bool FillConstantTestMain() { + static_assert(D1 >= D2, ""); + std::array arr; + arr.fill(0); + + UnrollFillConstant::Run(arr.data(), 1); + return CheckEquality(arr.data(), D2, 1) && + CheckEquality(arr.data() + D2, arr.size() - D2, 0); +} + +TEST(unroll_ops, fill_constant) { + EXPECT_TRUE((FillConstantTestMain<9, 0>())); + EXPECT_TRUE((FillConstantTestMain<9, 1>())); + EXPECT_TRUE((FillConstantTestMain<9, 4>())); + EXPECT_TRUE((FillConstantTestMain<9, 9>())); +} + +TEST(unroll_ops, assign) { + const int a[] = {1, 2, 3, 4, 5}; + int b[] = {0, 0, 0, 0, 0}; + UnrollAssign<3>::Run(a, b); + EXPECT_EQ(b[0], 1); + EXPECT_EQ(b[1], 2); + EXPECT_EQ(b[2], 3); + EXPECT_EQ(b[3], 0); + EXPECT_EQ(b[4], 0); +} + +TEST(unroll_ops, var_args_assign) { + int a[] = {0, 0, 0}; + UnrollVarArgsAssign::Run(a, 1, 2); + EXPECT_EQ(a[0], 1); + EXPECT_EQ(a[1], 2); + EXPECT_EQ(a[2], 0); +} + +TEST(unroll_ops, compare) { + int a[] = {1, 2, 3}; + int b[] = {1, 2, 4}; + EXPECT_TRUE(UnrollCompare<2>::Run(a, b)); + EXPECT_FALSE(UnrollCompare<3>::Run(a, b)); + + b[0] = -1; + EXPECT_TRUE(UnrollCompare<0>::Run(a, b)); + EXPECT_FALSE(UnrollCompare<1>::Run(a, b)); +} + +TEST(unroll_ops, add) { + int a[] = {2, 3, 4}; + int b[] = {5, 10, 102}; + int c[] = {0, 0, 0}; + UnrollAdd<2>::Run(a, b, c); + EXPECT_EQ(a[0] + b[0], c[0]); + EXPECT_EQ(a[1] + b[1], c[1]); + EXPECT_EQ(c[2], 0); +} + +TEST(unroll_ops, mul) { + int a[] = {2, 3, 4}; + int b[] = {5, 10, 102}; + int c[] = {0, 0, 0}; + UnrollMul<2>::Run(a, b, c); + EXPECT_EQ(a[0] * b[0], c[0]); + EXPECT_EQ(a[1] * b[1], c[1]); + EXPECT_EQ(c[2], 0); +} + +TEST(unroll_ops, product) { + int a[] = {2, 3, 4}; + int b[] = {5, 10, 102}; + + EXPECT_EQ(UnrollProduct<3>::Run(a), a[0] * a[1] * a[2]); + + EXPECT_EQ(UnrollProduct<3>::Run(a, b), + a[0] * b[0] + a[1] * b[1] + a[2] * b[2]); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 3b6f1cdb8f..73be446f71 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -19,52 +19,50 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/variable.h" namespace paddle { namespace framework { template -inline bool IsType(const std::type_index& type_index) { - return type_index == std::type_index(typeid(T)); +inline bool IsType(const std::type_index& type) { + return type == typeid(T); } -inline proto::VarType::Type ToVarType(std::type_index type) { - if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_RANK_TABLE; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR_ARRAY; - } else if (IsType(type)) { - return proto::VarType_Type_SELECTED_ROWS; - } else if (IsType(type)) { - return proto::VarType_Type_READER; - } else { - PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); +inline proto::VarType::Type ToVarType(int type) { + switch (type) { + case proto::VarType::LOD_TENSOR: + case proto::VarType::SELECTED_ROWS: + case proto::VarType::LOD_RANK_TABLE: + case proto::VarType::LOD_TENSOR_ARRAY: + case proto::VarType::READER: + return static_cast(type); + default: + PADDLE_THROW("ToVarType:Unsupported type %d", type); } } template inline void VisitVarType(const framework::Variable& var, Visitor visitor) { - switch (ToVarType(var.Type())) { - case proto::VarType_Type_LOD_TENSOR: + switch (var.Type()) { + case proto::VarType::LOD_TENSOR: visitor(var.Get()); return; - case proto::VarType_Type_LOD_RANK_TABLE: + case proto::VarType::LOD_RANK_TABLE: visitor(var.Get()); return; - case proto::VarType_Type_LOD_TENSOR_ARRAY: + case proto::VarType::LOD_TENSOR_ARRAY: visitor(var.Get()); return; - case proto::VarType_Type_SELECTED_ROWS: + case proto::VarType::SELECTED_ROWS: visitor(var.Get()); return; - case proto::VarType_Type_READER: + case proto::VarType::READER: visitor(var.Get()); return; default: - PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); + PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 7842168f60..2a75394fca 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -108,7 +108,7 @@ TEST(InferVarType, sum_op_without_infer_var_type) { op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, + ASSERT_EQ(proto::VarType::LOD_TENSOR, prog.MutableBlock(0)->Var("test2_out")->GetType()); } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc new file mode 100644 index 0000000000..c3c5bab23b --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
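(Aside on the var_type.h hunk above: VisitVarType now switches on the integer proto::VarType id stored in the Variable and hands the visitor the concretely typed payload, with PADDLE_THROW on unknown ids; the Get<...> template arguments were stripped in this dump. A toy standalone sketch of that dispatch shape, with made-up types that are not Paddle's:)

#include <iostream>
#include <stdexcept>
#include <string>

enum class Tag { kTensor, kString };  // stand-in for proto::VarType ids

struct ToyVar {
  Tag tag;
  double tensor = 0.0;
  std::string text;
};

// Switch on the stored type id, call the visitor with the concrete payload.
template <typename Visitor>
void Visit(const ToyVar& var, Visitor visitor) {
  switch (var.tag) {
    case Tag::kTensor:
      visitor(var.tensor);
      return;
    case Tag::kString:
      visitor(var.text);
      return;
    default:
      throw std::runtime_error("unsupported type id");  // PADDLE_THROW in the real code
  }
}

int main() {
  ToyVar v{Tag::kString, 0.0, "hello"};
  Visit(v, [](const auto& payload) { std::cout << payload << "\n"; });
  return 0;
}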
+ +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/platform/macros.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +// Besides registering variable type id, it is helpful to register a +// var_id -> std::type_index map (for example, get type names according to id) +namespace detail { + +template +struct VarIdToTypeIndexMapInitializerImpl { + template + static void Init(MapType1 *id_to_type, MapType2 *type_to_id) { + using Type = + typename std::tuple_element::type; + static_assert(!std::is_same::value, "Type cannot be void"); + constexpr int kId = VarTypeTrait::kId; + auto type = std::type_index(typeid(Type)); + PADDLE_ENFORCE(id_to_type->count(kId) == 0, + "Registered duplicate type id %d for type %s", kId, + type.name()); + PADDLE_ENFORCE(type_to_id->count(type) == 0, + "Registered duplicate type_index %s for id %d", type.name(), + kId); + id_to_type->emplace(kId, type); + type_to_id->emplace(type, kId); + VarIdToTypeIndexMapInitializerImpl::Init(id_to_type, + type_to_id); + } +}; + +template +struct VarIdToTypeIndexMapInitializerImpl { + template + static void Init(MapType1 *, MapType2 *) {} +}; + +// VarIdToTypeIndexMapInitializer is designed to initialize var_id -> +// std::type_index map and std::type_index -> var_id map +using VarIdToTypeIndexMapInitializer = + VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, + VarTypeRegistry::kRegisteredTypeNum == + 0>; + +struct VarIdToTypeIndexMapHolder { + DISABLE_COPY_AND_ASSIGN(VarIdToTypeIndexMapHolder); + + public: + static const std::type_index &ToTypeIndex(int var_id) { + auto it = Instance().id_to_type_map_.find(var_id); + PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(), + "VarId %d is not registered.", var_id); + return it->second; + } + + static int ToTypeId(const std::type_index &type) { + auto it = Instance().type_to_id_map_.find(type); + PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(), + "VarType %s is not registered.", type.name()); + return it->second; + } + + private: + VarIdToTypeIndexMapHolder() { + VarIdToTypeIndexMapInitializer::Init(&id_to_type_map_, &type_to_id_map_); + } + + static const VarIdToTypeIndexMapHolder &Instance() { + static const VarIdToTypeIndexMapHolder instance; + return instance; + } + + std::unordered_map id_to_type_map_; + std::unordered_map type_to_id_map_; +}; + +} // namespace detail + +const std::type_index &ToTypeIndex(int var_id) { + return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); +} + +const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } + +int ToTypeId(const std::type_index &type) { + return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h new file mode 100644 index 0000000000..cc68cf2ab8 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.h @@ -0,0 +1,195 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include +#ifndef _WIN32 +#include +#endif +#endif + +// Users should add forward declarations here +namespace paddle { + +namespace platform { +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +class Communicator; +#endif +#endif +} // namespace platform + +namespace framework { +class Tensor; +class LoDTensor; +class SelectedRows; +class LoDRankTable; +class ReaderHolder; +class Scope; +} // namespace framework + +namespace operators { +template +class AlgorithmsCache; + +class CudnnRNNCache; + +namespace reader { +class LoDTensorBlockingQueueHolder; +} // namespace reader +} // namespace operators + +} // namespace paddle + +namespace paddle { +namespace framework { + +const char *ToTypeName(int var_id); +const std::type_index &ToTypeIndex(int var_id); +int ToTypeId(const std::type_index &type); + +namespace detail { + +template +struct TypePosFinderImpl { + static constexpr int kPos = + std::is_same::value + ? kStart + : TypePosFinderImpl::kPos; +}; + +template +struct TypePosFinderImpl { + static constexpr int kPos = std::is_same::value ? kStart : -1; +}; + +// TypePosFinder helps to find the position in which T is inside Args... +// If T is not inside Args..., kPos would be -1 +template +struct TypePosFinder { + static constexpr int kPos = + TypePosFinderImpl::kPos; +}; + +template +struct VarTypeRegistryImpl { + static constexpr size_t kRegisteredTypeNum = sizeof...(Args); + using ArgTuple = std::tuple; + + // TypePos() returns the position in which T is inside Args... + // If T is not inside Args..., return -1 + template + static constexpr int TypePos() { + return TypePosFinder::kPos; + } + + // IsRegistered() returns whether T is registered inside RegistryImpl + template + static constexpr bool IsRegistered() { + return TypePos() >= 0; + } +}; + +} // namespace detail + +#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \ + template <> \ + struct VarTypeTrait { \ + static_assert(VarTypeRegistry::IsRegistered(), \ + "Must be registered type"); \ + using Type = type; \ + static constexpr int kId = static_cast(proto_id); \ + } + +/** + * The following codes are designed to register variable types. + * Only registered types can be stored in Variable. + * This registry mechanism is designed to speed up Variable. + * + * Caution: If you want to add more var types, please consider carefully + * whether you really need to add it. + */ + +// Users should add other variable types below. +// Paddle would generate unique Ids for each registered variable types. 
+using VarTypeRegistry = detail::VarTypeRegistryImpl< + Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, + LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, + std::map, operators::reader::LoDTensorBlockingQueueHolder, +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 + ncclUniqueId, platform::Communicator, +#endif + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::CudnnRNNCache, +#endif + int, float>; + +template +struct VarTypeTrait { + static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); + using Type = T; + /** + * Unique VarType Id generation. + * + * The auto-generated id should not be the same as any protobuf id defined in + * framework.proto. Therefore, we generate id by adding the type pos and + * maximum protobuf id (i.e., proto::VarType::TUPLE). + * + * However, we may need more protobuf id in the future. + * To avoid changing this auto id generation algorithm frequently, we + * generate id by adding the type pos and twice of maximum protobuf id (i.e., + * proto::VarType::TUPLE). + */ + static constexpr int kId = VarTypeRegistry::TypePos() + + static_cast(proto::VarType::TUPLE) * 2; +}; + +// Users should set some of variable type ids to be what is defined in +// framework.proto below +REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR); +REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS); +REG_PROTO_VAR_TYPE_TRAIT(std::vector, proto::VarType::STEP_SCOPES); +REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); +REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); +REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); +REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); +REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); + +/** End of variable type registration */ + +template +inline constexpr bool IsRegisteredVarType() { + return VarTypeRegistry::IsRegistered(); +} + +#undef REG_PROTO_VAR_TYPE_TRAIT +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc new file mode 100644 index 0000000000..00840d634d --- /dev/null +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +template +struct TypeIndexChecker { + template + static void Check(SetType1 *var_id_set, SetType2 *type_index_set) { + using Type = + typename std::tuple_element::type; + static_assert(std::is_same::Type, Type>::value, + "Type must be the same"); + constexpr auto kId = VarTypeTrait::kId; + std::type_index actual_type(typeid(Type)); + EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); + EXPECT_EQ(ToTypeIndex(kId), actual_type); + EXPECT_EQ(ToTypeId(actual_type), kId); + EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type); + EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); + + EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT + EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT + var_id_set->insert(kId); + type_index_set->insert(std::type_index(typeid(Type))); + TypeIndexChecker::Check(var_id_set, + type_index_set); + } +}; + +template +struct TypeIndexChecker { + template + static void Check(SetType1 *, SetType2 *) {} +}; + +TEST(var_type_traits, check_no_duplicate_registry) { + constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum; + std::unordered_set var_id_set; + std::unordered_set type_index_set; + TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check( + &var_id_set, &type_index_set); +} + +template +bool CheckVarId(int proto_id) { + static_assert(std::is_same::Type, T>::value, + "Type must be the same"); + return VarTypeTrait::kId == proto_id; +} + +TEST(var_type_traits, check_proto_type_id) { + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR)); + ASSERT_TRUE(CheckVarId(proto::VarType::SELECTED_ROWS)); + ASSERT_TRUE(CheckVarId>(proto::VarType::STEP_SCOPES)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_RANK_TABLE)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR_ARRAY)); + ASSERT_TRUE(CheckVarId(proto::VarType::PLACE_LIST)); + ASSERT_TRUE(CheckVarId(proto::VarType::READER)); + ASSERT_TRUE(CheckVarId(proto::VarType::INT32)); + ASSERT_TRUE(CheckVarId(proto::VarType::FP32)); + + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, proto::VarType::LOD_TENSOR); + ASSERT_EQ(proto::VarType_Type_SELECTED_ROWS, proto::VarType::SELECTED_ROWS); + ASSERT_EQ(proto::VarType_Type_STEP_SCOPES, proto::VarType::STEP_SCOPES); + ASSERT_EQ(proto::VarType_Type_LOD_RANK_TABLE, proto::VarType::LOD_RANK_TABLE); + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR_ARRAY, + proto::VarType::LOD_TENSOR_ARRAY); + ASSERT_EQ(proto::VarType_Type_PLACE_LIST, proto::VarType::PLACE_LIST); + ASSERT_EQ(proto::VarType_Type_READER, proto::VarType::READER); + ASSERT_EQ(proto::VarType_Type_FEED_MINIBATCH, proto::VarType::FEED_MINIBATCH); + ASSERT_EQ(proto::VarType_Type_FETCH_LIST, proto::VarType::FETCH_LIST); + ASSERT_EQ(proto::VarType_Type_RAW, proto::VarType::RAW); + ASSERT_EQ(proto::VarType_Type_TUPLE, proto::VarType::TUPLE); + ASSERT_EQ(proto::VarType_Type_INT32, proto::VarType::INT32); + ASSERT_EQ(proto::VarType_Type_FP32, proto::VarType::FP32); +} + 
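(Aside before the next test: the TypePos/IsRegistered machinery in var_type_traits.h is hard to read in this dump because the template argument lists were stripped. The TEST(var_type_traits, test_registry) that follows exercises Paddle's real VarTypeRegistryImpl; the snippet below is only a cleaned-up standalone sketch of the same idea, a compile-time scan of a parameter pack that yields the index of T or -1, with illustrative names.)

#include <type_traits>

// Scan the pack left to right; kPos is the index of T, or -1 if absent.
template <typename T, int kIndex, typename First, typename... Rest>
struct PosFinderImpl {
  static constexpr int kPos = std::is_same<T, First>::value
                                  ? kIndex
                                  : PosFinderImpl<T, kIndex + 1, Rest...>::kPos;
};

// Terminating case: only one candidate type left.
template <typename T, int kIndex, typename Last>
struct PosFinderImpl<T, kIndex, Last> {
  static constexpr int kPos = std::is_same<T, Last>::value ? kIndex : -1;
};

template <typename... Args>
struct Registry {
  template <typename T>
  static constexpr int TypePos() {
    return PosFinderImpl<T, 0, Args...>::kPos;
  }
  template <typename T>
  static constexpr bool IsRegistered() {
    return TypePos<T>() >= 0;
  }
};

using Demo = Registry<int, float, double>;
static_assert(Demo::TypePos<float>() == 1, "float is the second entry");
static_assert(!Demo::IsRegistered<char>(), "char is not registered");

(In the patch, VarTypeTrait<T>::kId is then derived from this position, offset by twice proto::VarType::TUPLE so auto-generated ids never collide with the protobuf-defined ones.)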
+TEST(var_type_traits, test_registry) { + using Registry = detail::VarTypeRegistryImpl; + ASSERT_TRUE(Registry::TypePos() == 0); + ASSERT_TRUE(Registry::TypePos() == 1); + ASSERT_TRUE(Registry::TypePos() == 2); + ASSERT_TRUE(Registry::TypePos() == 3); + ASSERT_TRUE(Registry::TypePos() == -1); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 873e1b20a5..b9d07da822 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/var_type_traits.h" namespace paddle { namespace framework { @@ -27,10 +27,14 @@ class Variable { public: template const T& Get() const { + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing"); - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); return *static_cast(holder_->Ptr()); } @@ -39,61 +43,61 @@ class Variable { template T* GetMutable() { if (!holder_) { - holder_.reset(new PlaceholderImpl(new T())); + holder_.reset(new PlaceholderImpl()); } else { - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); } return static_cast(holder_->Ptr()); } template bool IsType() const { - return holder_ != nullptr && - std::type_index(typeid(T)) == std::type_index(holder_->Type()); + return holder_ && holder_->Type() == VarTypeTrait::kId; } void Clear() { holder_.reset(); } - std::type_index Type() const { + int Type() const { PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory"); return holder_->Type(); } private: struct Placeholder { - virtual ~Placeholder() {} - virtual const std::type_info& Type() const = 0; - virtual void* Ptr() const = 0; + virtual ~Placeholder() = default; + + inline int Type() const { return type_; } + inline const void* Ptr() const { return ptr_; } + inline void* Ptr() { return ptr_; } + + protected: + inline void Init(void* p, int type) { + ptr_ = p; + type_ = type; + } + + void* ptr_; + int type_; }; // Placeholder hides type T, so it doesn't appear as a template // parameter of Variable. template struct PlaceholderImpl : public Placeholder { - explicit PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} - - virtual const std::type_info& Type() const { return type_; } - virtual void* Ptr() const { return static_cast(ptr_.get()); } + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); + PlaceholderImpl() { this->Init(&obj_, VarTypeTrait::kId); } - std::unique_ptr ptr_; - const std::type_info& type_; + private: + T obj_; }; - std::unique_ptr - holder_; // pointers to a PlaceholderImpl object indeed. - - // name_ is only meaningful with a Scope and accessible by it. - // - // NOTE: Please don't expose name_ by adding methods like - // Variable::Name or Scope::VarName! A variable could have a human - // readable name or an auto-generated scope-unique name. 
In the - // former case, the caller knows the name and doesn't need to access - // the name; in the latter case, the variable should be identified - // by its address but not the unreadable name. - friend class Scope; - const std::string* name_; + // pointers to a PlaceholderImpl object indeed. + std::unique_ptr holder_; }; } // namespace framework diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index 003dcfd3df..511c9c5214 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -16,27 +16,28 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -TEST(Variable, GetMutable) { - using paddle::framework::Variable; - - struct Tensor { - int content_; - }; +namespace paddle { +namespace framework { +TEST(Variable, GetMutable) { std::unique_ptr v(new Variable()); - Tensor* t = v->GetMutable(); - t->content_ = 1234; + auto* t = v->GetMutable(); + *t = "1234"; - const Tensor& tt = v->Get(); - EXPECT_EQ(1234, tt.content_); + const auto& tt = v->Get(); + EXPECT_EQ("1234", tt); try { - v->GetMutable(); + v->GetMutable(); } catch (std::exception& e) { return; } EXPECT_TRUE(false); } + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 342cb68ab2..9813149865 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -31,8 +32,14 @@ using framework::Variable; void AddTo(Variable* src, Variable* dst) { framework::LoDTensor* dst_tensor = dst->GetMutable(); framework::LoDTensor* src_tensor = src->GetMutable(); - PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", - dst_tensor->numel(), src_tensor->numel()); + // FIXME(minqiyang): loss_grad op will pass a zero grad of label + // ugly fix for it + if (src_tensor->numel() == 0) { + return; + } + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), + "dst_numel %lld vs. 
src_numel %lld", dst_tensor->numel(), + src_tensor->numel()); float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); const float* src_data = src_tensor->data(); for (size_t i = 0; i < src_tensor->numel(); ++i) { @@ -42,12 +49,12 @@ void AddTo(Variable* src, Variable* dst) { class Autograd { public: - explicit Autograd(framework::Scope* scope) : scope_(scope) {} + Autograd() {} void RunBackward(VarBase* var) { - PADDLE_ENFORCE(var->pre_op_->op_desc_); - // TODO(panyx0718): Only create for vars that "require_grad" - (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_; + if (var->stop_gradient_) { + return; + } std::deque ready; ready.push_back(var->pre_op_); @@ -57,18 +64,25 @@ class Autograd { while (!ready.empty()) { OpBase* ready_op = ready.front(); ready.pop_front(); - std::vector input_grads = ready_op->ApplyGrad(scope_); - - for (size_t i = 0; i < input_grads.size(); ++i) { - if (!input_grads[i]) continue; - OpBase* pre_op = ready_op->pre_ops_->at(i); - if (!pre_op) continue; - - dep_counts[pre_op] -= 1; - PADDLE_ENFORCE(dep_counts[pre_op] >= 0); - bool pre_op_ready = dep_counts[pre_op] == 0; - if (pre_op_ready) { - ready.push_back(pre_op); + std::map> input_grads = + ready_op->ApplyGrad(); + + for (auto it : input_grads) { + const std::vector& ingrads = it.second; + for (size_t i = 0; i < ingrads.size(); ++i) { + if (!ingrads[i]) continue; + if (ready_op->input_vars_[it.first][i]->stop_gradient_) { + continue; + } + OpBase* pre_op = ready_op->pre_ops_[it.first][i]; + if (!pre_op) continue; + + dep_counts[pre_op] -= 1; + PADDLE_ENFORCE(dep_counts[pre_op] >= 0); + bool pre_op_ready = dep_counts[pre_op] == 0; + if (pre_op_ready) { + ready.push_back(pre_op); + } } } } @@ -85,138 +99,88 @@ class Autograd { while (!queue.empty()) { OpBase* candidate = queue.front(); queue.pop_front(); - for (OpBase* pre_op : *(candidate->pre_ops_)) { - if (!pre_op) continue; - if (visited.find(pre_op) == visited.end()) { - visited.insert(pre_op); - queue.push_back(pre_op); + for (auto it : candidate->pre_ops_) { + for (OpBase* pre_op : it.second) { + if (!pre_op) continue; + if (visited.find(pre_op) == visited.end()) { + visited.insert(pre_op); + queue.push_back(pre_op); + } + ret[pre_op] += 1; } - ret[pre_op] += 1; } } - return ret; } - - framework::Scope* scope_; }; -framework::Variable* CreateVariable(const std::string& name, - const framework::DDim& dim, float val, - framework::Scope* scope, - bool random_name = true) { - std::string varname = name; - if (random_name) { - std::mt19937 rng; - rng.seed(std::random_device()()); - std::uniform_int_distribution dist6( - 1, std::numeric_limits::max()); - int id = dist6(rng); - varname = string::Sprintf("%s@%d", varname, id); - } - - VLOG(3) << "creating var " << varname; - framework::Variable* var = scope->Var(varname); - framework::LoDTensor* tensor = var->GetMutable(); - - float* data = tensor->mutable_data(dim, platform::CPUPlace()); - std::fill(data, data + tensor->numel(), val); - return var; -} - framework::LoDTensor& VarBase::Grad() { VLOG(3) << "get var grad " << var_desc_->Name(); return *grads_->GetMutable(); } -void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { - VLOG(3) << "apply var grad " << var_desc_->Name() << " " - << grad->Get().data()[0]; - if (!grads_) { - grads_ = - CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), - var_->Get().dims(), 0.0, scope); +std::map> OpBase::ApplyGrad() { + if (!grad_op_desc_) { + LOG(WARNING) << "op with no grad: " << op_desc_->Type(); + return {}; } 
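The ready-queue logic that the new Autograd::RunBackward and ComputeDepCounts introduce above boils down to reference counting over the pre-op graph: count how many consumers each op has, then release an op only once every consumer has applied its gradient. Below is a minimal standalone C++ sketch of that scheduling idea; the Op struct and RunBackward signature are illustrative stand-ins, not Paddle's actual classes.

#include <deque>
#include <map>
#include <set>
#include <vector>

// Hypothetical op node: each op only remembers the ops that produced its
// inputs, which is all the backward pass needs.
struct Op {
  std::vector<Op*> pre_ops;
  void ApplyGrad() { /* run the registered grad kernel here */ }
};

void RunBackward(Op* start) {
  // Pass 1: BFS over predecessors, counting one dependency per consuming edge.
  std::map<Op*, int> dep_counts;
  std::set<Op*> visited;
  std::deque<Op*> bfs = {start};
  while (!bfs.empty()) {
    Op* cur = bfs.front();
    bfs.pop_front();
    for (Op* pre : cur->pre_ops) {
      if (!pre) continue;
      if (visited.insert(pre).second) bfs.push_back(pre);
      ++dep_counts[pre];
    }
  }

  // Pass 2: run grads; an op becomes ready once all its consumers have run.
  std::deque<Op*> ready = {start};
  while (!ready.empty()) {
    Op* op = ready.front();
    ready.pop_front();
    op->ApplyGrad();
    for (Op* pre : op->pre_ops) {
      if (!pre) continue;
      if (--dep_counts[pre] == 0) ready.push_back(pre);
    }
  }
}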
- AddTo(grad, grads_); - VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " - << grads_->Get().data()[0]; -} - -std::vector OpBase::ApplyGrad(framework::Scope* scope) { VLOG(3) << "op grad " << grad_op_desc_->Type(); - for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { - if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { - // grad op inputs can be forward inputs, so not in grad_to_var. - continue; - } - VLOG(3) << "op grad in var " << grad_invar; - block_->FindRecursiveOrCreateVar(grad_invar); - framework::Variable* var = scope->Var(grad_invar); - const std::string& invar = grad_to_var_->at(grad_invar); - for (VarBase* varbase : *output_vars_) { - // Use the accumulated grads_ by sharing the input with grads_. - if (varbase->var_desc_->Name() == invar) { - var->GetMutable()->ShareDataWith( - varbase->grads_->Get()); - break; - } + std::vector> tmp_vars; + std::map> grad_outputs; + for (auto it : grad_output_vars_) { + auto& outputs = grad_outputs[it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + // Allocate a new variable + Variable* tmp_var = new framework::Variable(); + tmp_var->GetMutable(); + + tmp_vars.emplace_back(tmp_var); + outputs.push_back(tmp_var); } } - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - VLOG(3) << "grad outvar " << outvar; - block_->FindRecursiveOrCreateVar(outvar); - framework::Variable* var = scope->Var(outvar); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block_->FindVar(outvar); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } - } - } - grad_op_desc_->InferShape(*block_); + framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + + // No need to do compile time infer shape here. + // grad_op_desc_->InferShape(*block_); grad_op_desc_->InferVarType(block_); + std::unique_ptr opbase = framework::OpRegistry::CreateOp(*grad_op_desc_); - - opbase->Run(*scope, platform::CPUPlace()); - - // `ret` matches exactly with `input_vars_` of forward op. - std::vector ret; - for (size_t i = 0; i < input_vars_->size(); ++i) { - bool found = false; - VarBase* origin_var = (*input_vars_)[i]; - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - Variable* var = scope->FindVar(outvar); - std::string orig_var = grad_to_var_->at(outvar); - if (origin_var->var_desc_->Name() != orig_var) { - continue; - } - VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; - origin_var->ApplyGrad(scope, var); - found = true; - ret.push_back(var); - // TODO(panyx0718): There might be another outvar with the same name. - // In that case, it doesn't matter the first one or the second one is - // used. 
- break; - } - if (!found) { - ret.push_back(nullptr); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + platform::CPUPlace place; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); + p.op.RuntimeInferShape(scope, place, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + + for (auto it : grad_output_vars_) { + auto& outputs = grad_outputs[it.first]; + auto& origin_outputs = it.second; + + for (size_t i = 0; i < outputs.size(); ++i) { + framework::Variable* orig_grad = origin_outputs[i]; + AddTo(outputs[i], orig_grad); } } - return ret; + return input_vars_; } -void VarBase::RunBackward(framework::Scope* scope) { - grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()), - var_->Get().dims(), 1.0, scope, - false); +void VarBase::RunBackward() { if (!pre_op_) return; - Autograd(scope).RunBackward(this); + + auto grads_t = grads_->GetMutable(); + float* data = grads_t->mutable_data(platform::CPUPlace()); + std::fill(data, data + grads_t->numel(), 1.0); + + PADDLE_ENFORCE( + grads_ == + pre_op_->output_vars_[pre_op_out_name_][pre_op_out_idx_]->grads_); + Autograd().RunBackward(this); } } // namespace imperative diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 85a71ca83d..2abda933cf 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -14,17 +14,69 @@ #pragma once +#include #include #include #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace imperative { +class PreparedOp { + public: + PreparedOp(const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, + framework::OperatorWithKernel::OpKernelFunc func, + platform::DeviceContext* dev_ctx) + : op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {} + + static PreparedOp Prepare(const framework::RuntimeContext& ctx, + const framework::OperatorWithKernel& op, + const platform::Place& place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. 
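The ApplyGrad and RunBackward changes above amount to: run each grad kernel into freshly allocated temporaries, accumulate those temporaries into the persistent grads_ buffers (skipping the empty gradient that loss_grad emits for the label input), and seed backward by filling the root gradient with ones. A simplified sketch over plain float vectors, with illustrative names rather than Paddle types:

#include <cstddef>
#include <vector>

// Element-wise accumulation; an empty source is skipped, mirroring the
// zero-sized label gradient the FIXME above works around.
void AddTo(const std::vector<float>& src, std::vector<float>* dst) {
  if (src.empty()) return;
  if (src.size() != dst->size()) return;  // the real code enforces equality
  for (std::size_t i = 0; i < src.size(); ++i) (*dst)[i] += src[i];
}

// One backward step: the grad kernel writes into fresh temporaries, and only
// afterwards are they folded into the persistent per-variable gradients.
void AccumulateStep(const std::vector<std::vector<float>>& kernel_outputs,
                    std::vector<std::vector<float>>* accumulated) {
  for (std::size_t i = 0; i < kernel_outputs.size(); ++i) {
    AddTo(kernel_outputs[i], &(*accumulated)[i]);
  }
}

// Backward is seeded by filling the root gradient with ones.
void SeedOnes(std::vector<float>* root_grad) {
  root_grad->assign(root_grad->size(), 1.0f);
}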
+ auto& all_op_kernels = op.AllOpKernels(); + auto kernels_iter = all_op_kernels.find(op.Type()); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", + op.Type()); + } + + framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second; + + auto expected_kernel_key = op.GetExpectedKernelType( + framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_MKLDNN + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = framework::LibraryType::kPlain; + expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } +#endif + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", op.Type(), + KernelTypeToString(expected_kernel_key)); + } + return PreparedOp(op, ctx, kernel_iter->second, dev_ctx); + } + + const framework::OperatorBase& op; + const framework::RuntimeContext& ctx; + framework::OperatorWithKernel::OpKernelFunc func; + platform::DeviceContext* dev_ctx; +}; class OpBase; class VarBase { @@ -33,56 +85,62 @@ class VarBase { : pre_op_(nullptr), pre_op_out_idx_(-1), var_desc_(nullptr), - var_(nullptr), - grads_(nullptr) {} + var_(new framework::Variable()), + grads_(new framework::Variable()), + stop_gradient_(false) {} - virtual ~VarBase() {} + explicit VarBase(bool stop_gradient) + : pre_op_(nullptr), + pre_op_out_idx_(-1), + var_desc_(nullptr), + var_(new framework::Variable()), + grads_(new framework::Variable()), + stop_gradient_(stop_gradient) {} - void ApplyGrad(framework::Scope* scope, framework::Variable* grad); + virtual ~VarBase() {} - void RunBackward(framework::Scope* scope); + void RunBackward(); framework::LoDTensor& Grad(); + inline std::string GradName() const { + PADDLE_ENFORCE( + var_desc_, + "Couldn't get gradient variable's name, please call backward() first"); + return string::Sprintf("%s@IGrad", var_desc_->Name()); + } + OpBase* pre_op_; + std::string pre_op_out_name_; int pre_op_out_idx_; framework::VarDesc* var_desc_; framework::Variable* var_; framework::Variable* grads_; + + bool stop_gradient_; }; class OpBase { public: - OpBase() - : input_vars_(new std::vector()), - output_vars_(new std::vector()), - pre_ops_(new std::vector()), - pre_ops_out_idx_(new std::vector()), - op_desc_(nullptr), - grad_op_desc_(nullptr) {} + OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {} virtual ~OpBase() { - delete input_vars_; - delete output_vars_; - - delete pre_ops_; - delete pre_ops_out_idx_; - if (grad_op_desc_) delete grad_op_desc_; - if (grad_to_var_) delete grad_to_var_; } - std::vector ApplyGrad(framework::Scope* scope); + std::map> ApplyGrad(); - std::vector* input_vars_; - std::vector* output_vars_; - std::vector* pre_ops_; - std::vector* pre_ops_out_idx_; framework::OpDesc* op_desc_; - framework::OpDesc* grad_op_desc_; - std::unordered_map* grad_to_var_; + + std::map> input_vars_; + std::map> output_vars_; + std::map> pre_ops_; + std::map> pre_ops_out_idx_; + + std::map> grad_input_vars_; + std::map> grad_output_vars_; framework::BlockDesc* block_; }; diff --git a/paddle/fluid/imperative/tracer.h 
b/paddle/fluid/imperative/tracer.h index 97772dc110..c6eff86fac 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -20,7 +20,6 @@ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/layer.h" @@ -41,22 +40,26 @@ void CreateGradOp(const framework::OpDesc& op_desc, *grad_op_desc = grad_op_descs[0].release(); } +void InitVar(framework::Variable* var, framework::Variable* grad_var) { + auto& var_t = var->Get(); + float* data = + grad_var->GetMutable()->mutable_data( + var_t.dims(), platform::CPUPlace()); + std::fill(data, data + var_t.numel(), 0.0); +} + class Tracer { public: - explicit Tracer(framework::BlockDesc* root_block, - framework::BlockDesc* startup_block) - : root_block_(root_block), startup_block_(startup_block) { - root_scope_ = new framework::Scope(); - scopes_[root_block_] = root_scope_; - scopes_[startup_block_] = root_scope_; - } + explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} - virtual ~Tracer() { delete root_scope_; } + virtual ~Tracer() {} + + void Trace(OpBase* op, + const std::map>& inputs, + const std::map>& outputs, + framework::BlockDesc* block, const bool stop_gradient = false) { + std::map vars; - void Trace(OpBase* op, const std::vector& inputs, - const std::vector& outputs, - framework::BlockDesc* block) { - framework::Scope* scope = GetScope(block); framework::OpDesc* op_desc = op->op_desc_; VLOG(3) << "tracer tracing " << op_desc->Type(); op_desc->InferShape(*block); @@ -64,77 +67,113 @@ class Tracer { std::unique_ptr op_base = framework::OpRegistry::CreateOp(*op_desc); - *op->input_vars_ = inputs; - for (VarBase* input : inputs) { - const std::string vname = input->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - input->var_ = var; - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); + framework::VariableValueMap invars_map; + framework::VariableValueMap outvars_map; + + op->input_vars_ = inputs; + for (auto it : op->input_vars_) { + auto& invars = invars_map[it.first]; + for (VarBase* inp : it.second) { + PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", + op->op_desc_->Type(), inp->var_desc_->Name()); + + invars.push_back(inp->var_); + vars[inp->var_desc_->Name()] = inp; + if (inp->pre_op_) { + op->pre_ops_[it.first].push_back(inp->pre_op_); + op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_); } else { - LOG(ERROR) << "tracer doesn't support yet"; + op->pre_ops_[it.first].push_back(nullptr); } + VLOG(3) << "input vname " << inp->var_desc_->Name() << " " + << inp->var_->IsInitialized(); } - if (input->pre_op_) { - op->pre_ops_->push_back(input->pre_op_); - op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_); - } else { - op->pre_ops_->push_back(nullptr); - } - VLOG(3) << "input vname " << vname << " " - << var->Get().dims().size(); } - *op->output_vars_ = outputs; - for (size_t i = 0; i < outputs.size(); ++i) { - const std::string vname = outputs[i]->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); + op->output_vars_ = outputs; + for (auto it : op->output_vars_) { + auto& outvars = outvars_map[it.first]; + const std::vector& outputs = it.second; + for (size_t i = 
0; i < outputs.size(); ++i) { + VarBase* out = outputs[i]; + outvars.push_back(out->var_); + vars[out->var_desc_->Name()] = out; + + framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name()); if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); + out->var_->GetMutable(); } else { LOG(ERROR) << "tracer doesn't support yet"; } + out->stop_gradient_ = stop_gradient; + out->pre_op_ = op; + out->pre_op_out_name_ = it.first; + out->pre_op_out_idx_ = i; + + VLOG(3) << "output vname " << out->var_desc_->Name() << " " + << out->var_->IsInitialized(); } - outputs[i]->var_ = var; - outputs[i]->pre_op_ = op; - outputs[i]->pre_op_out_idx_ = i; } VLOG(3) << "tracer running " << op_desc->Type(); - op_base->Run(*scope, platform::CPUPlace()); - if (block == startup_block_) { - op->grad_op_desc_ = nullptr; - op->grad_to_var_ = nullptr; - } else { + framework::RuntimeContext ctx(invars_map, outvars_map); + + // TODO(panyx0718): Cache p. + framework::OperatorWithKernel* op_kernel = + dynamic_cast(op_base.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + platform::CPUPlace place; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); + p.op.RuntimeInferShape(scope, place, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + + if (!stop_gradient) { framework::OpDesc* grad_op_desc; auto grad_to_var = new std::unordered_map(); CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); op->grad_op_desc_ = grad_op_desc; - op->grad_to_var_ = grad_to_var; - } - op->block_ = block; - } - framework::Scope* GetScope(framework::BlockDesc* block) { - if (scopes_.find(block) != scopes_.end()) { - return scopes_.at(block); + for (auto it : grad_op_desc->Inputs()) { + auto& grad_in_vars = op->grad_input_vars_[it.first]; + for (const std::string& grad_invar : it.second) { + block->FindRecursiveOrCreateVar(grad_invar); + auto var_it = grad_to_var->find(grad_invar); + if (var_it == grad_to_var->end()) { + auto fwd_var_it = vars.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != vars.end()); + grad_in_vars.push_back(fwd_var_it->second->var_); + } else { + VarBase* var = vars[var_it->second]; + if (!var->grads_->IsInitialized()) { + InitVar(var->var_, var->grads_); + } + grad_in_vars.push_back(var->grads_); + } + } + } + + for (auto it : grad_op_desc->Outputs()) { + auto& grad_out_vars = op->grad_output_vars_[it.first]; + for (const std::string& grad_outvar : it.second) { + block->FindRecursiveOrCreateVar(grad_outvar); + auto var_it = grad_to_var->find(grad_outvar); + PADDLE_ENFORCE(var_it != grad_to_var->end()); + VarBase* var = vars[var_it->second]; + if (!var->grads_->IsInitialized()) { + InitVar(var->var_, var->grads_); + } + grad_out_vars.push_back(var->grads_); + } + } } - framework::BlockDesc* parent_block = block->ParentBlock(); - PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); - framework::Scope* scope = &scopes_[parent_block]->NewScope(); - scopes_[block] = scope; - return scope; + + op->block_ = block; } private: - std::map scopes_; framework::BlockDesc* root_block_; - framework::BlockDesc* startup_block_; - framework::Scope* root_scope_; }; } // namespace imperative diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index cb88333d15..f84e1ab6b8 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -69,17 +69,17 @@ void 
TestWord2vecPrediction(const std::string& model_path) { std::vector outputs; CHECK(predictor->Run(slots, &outputs)); - PADDLE_ENFORCE(outputs.size(), 1UL); + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); // Check the output buffer size and result of each tid. - PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL); float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(static_cast(5UL), num_elements); i++) { - LOG(INFO) << "data: " - << static_cast(outputs.front().data.data())[i]; + LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i] + << " result: " << result[i]; PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], result[i]); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 83d411eecf..2d8980b1d1 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -123,10 +123,9 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); - DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller, - std::function); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); + DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); // The program transformed by IR analysis phase. DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 51bca8039d..e37fea38bc 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument, for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); - // Set some pass attributes. - if (pass_name == "ir_analysis_pass") { - pass->Set("tensorrt_node_teller", - new SubgraphDetector::NodeInsideSubgraphTeller( - argument->tensorrt_node_teller())); - } - if (pass_name == "graph_viz_pass") { std::string dot_file_path = std::to_string(pass_num) + "_ir_" + (pre_pass.empty() ? 
"origin" : pre_pass) + @@ -70,11 +63,10 @@ void IRPassManager::CreatePasses(Argument *argument, } if (pass_name == "tensorrt_subgraph_pass") { - PADDLE_ENFORCE(argument->tensorrt_node_teller_valid()); - pass->SetNotOwned("tensorrt_node_teller", - argument->tensorrt_node_teller_ptr()); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); + pass->Set("min_subgraph_size", + new int(argument->tensorrt_min_subgraph_size())); } // graph_ = pass->Apply(std::move(graph_)); diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index 822c7799bb..9ae5b8aa17 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,9 +1,13 @@ cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) -cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector) -set(analysis_deps ${analysis_deps} - subgraph_detector tensorrt_subgraph_pass - CACHE INTERNAL "") -set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) -file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") -set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") +if (TENSORRT_FOUND) + cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) + + set(analysis_deps ${analysis_deps} + subgraph_detector tensorrt_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) + file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 9c42b83e7a..bc06e78ae6 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include "paddle/fluid/inference/tensorrt/op_teller.h" namespace paddle { namespace inference { @@ -33,10 +36,13 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( std::unique_ptr graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); - auto teller = - Get("tensorrt_node_teller"); + auto teller = [](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); + }; - SubGraphFuser fuser(graph.get(), teller, 2 /*min subgraph size*/); + SubGraphFuser fuser(graph.get(), teller, + Get("min_subgraph_size") /*min subgraph size*/); fuser(); for (auto *node : graph->Nodes()) { @@ -197,10 +203,26 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, std::vector ExtractParameters( const std::unordered_set &nodes) { + // We can judge whether a variable is a parameter by + // its presistable property, but sometimes the presistable + // of the feed op output is true, so we have to identify it. + std::vector feed_outputs; + for (const auto &node : nodes) { + if (!node->IsOp()) continue; + std::string op_type = node->Op()->Type(); + if (op_type == "feed") { + std::vector output_names = node->Op()->OutputArgumentNames(); + std::copy(output_names.begin(), output_names.end(), + std::back_inserter(feed_outputs)); + } + } + std::vector parameters; for (const auto &node : nodes) { if (!node->IsVar()) continue; - if (node->Var()->Persistable()) { + if (node->Var()->Persistable() && + std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == + feed_outputs.end()) { parameters.push_back(node->Name()); } } @@ -213,6 +235,6 @@ std::vector ExtractParameters( REGISTER_PASS(tensorrt_subgraph_pass, paddle::inference::analysis::TensorRtSubgraphPass) - .RequirePassAttr("tensorrt_node_teller") .RequirePassAttr("max_batch_size") - .RequirePassAttr("workspace_size"); + .RequirePassAttr("workspace_size") + .RequirePassAttr("min_subgraph_size"); diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index c3a2b3ca1d..490189e550 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -27,9 +27,6 @@ namespace analysis { void IrAnalysisComposePass::RunImpl(Argument *argument) { ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); - if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { - InitTensorRTAttrs(argument); - } ApplyIrPasses(argument); CollectFusionStatis(argument); } @@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const { return "ir-analysis-compose-pass"; } -void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { - if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { - LOG(INFO) << "Initing TensorRT pass"; - argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) { - std::unordered_set teller_set( - {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", - "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - 
"elementwise_add", "elementwise_mul", "dropout", "split", "prelu", - "conv2d_transpose", "leaky_relu"}); - if (!node->IsOp()) return false; - - if (teller_set.count(node->Op()->Type())) { - return true; - } else { - return false; - } - }); - } -} - void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { std::vector passes({ "ir_graph_build_pass", "ir_analysis_pass", diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h index 53e2ebb003..16c6b7d84d 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h @@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass { std::string repr() const override; private: - void InitTensorRTAttrs(Argument* argument); - void ApplyIrPasses(Argument* argument); void CollectFusionStatis(Argument* argument); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index dcefdd92f5..211c691504 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -14,84 +14,101 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle_pass_builder.h" // NOLINT +#include "paddle/fluid/platform/gpu_info.h" namespace paddle { PassStrategy *contrib::AnalysisConfig::pass_builder() const { - PADDLE_ENFORCE( - pass_builder_.get(), - "Should call constructor first, that will init the pass_builder_."); + if (!pass_builder_.get()) { + if (use_gpu_) { + LOG(INFO) << "Create GPU IR passes"; + pass_builder_.reset(new GpuPassStrategy); + } else { + LOG(INFO) << "Create CPU IR passes"; + pass_builder_.reset(new CpuPassStrategy); + } + } else if (pass_builder_->use_gpu() ^ use_gpu()) { + LOG(WARNING) << "The use_gpu flag is not compatible between Config and " + "PassBuilder, the flags are " + << use_gpu() << " " << pass_builder_->use_gpu(); + LOG(WARNING) << "Please make them compatible, still use the existing " + "PassBuilder."; + } + return pass_builder_.get(); } -contrib::AnalysisConfig::AnalysisConfig(bool use_gpu) { - this->use_gpu = use_gpu; - if (use_gpu) { - pass_builder_.reset(new GpuPassStrategy); - } else { - pass_builder_.reset(new CpuPassStrategy); - } +contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) { + model_dir_ = model_dir; +} +contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file, + const std::string ¶ms_file) { + prog_file_ = prog_file; + params_file_ = params_file; +} +void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path, + const std::string ¶ms_file_path) { + prog_file_ = prog_file_path; + params_file_ = params_file_path; +} +void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, + int device_id) { +#ifdef PADDLE_WITH_CUDA + use_gpu_ = true; + memory_pool_init_size_mb_ = memory_pool_init_size_mb; + device_id_ = device_id; +#else + LOG(ERROR) << "Please compile with gpu to EnableGpu"; + use_gpu_ = false; +#endif } +void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; } contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { - // fields from Config - model_dir = other.model_dir; 
- // fields from NativeConfig - use_gpu = other.use_gpu; - device = other.device; - fraction_of_gpu_memory = other.fraction_of_gpu_memory; - prog_file = other.prog_file; - param_file = other.param_file; - specify_input_name = other.specify_input_name; - cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; - // fields from this. - enable_ir_optim = other.enable_ir_optim; - // For mkldnn - use_mkldnn_ = other.use_mkldnn_; - mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; - - use_feed_fetch_ops = other.use_feed_fetch_ops; - use_tensorrt_ = other.use_tensorrt_; - tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; - tensorrt_workspace_size_ = other.tensorrt_workspace_size_; - model_from_memory_ = other.model_from_memory_; - - if (use_gpu) { +#define CP_MEMBER(member__) member__ = other.member__; + + // Model related. + CP_MEMBER(model_dir_); + CP_MEMBER(prog_file_); + CP_MEMBER(params_file_); + CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and + // params_file_ fields. + // Gpu releated. + CP_MEMBER(use_gpu_); + CP_MEMBER(device_id_); + CP_MEMBER(memory_pool_init_size_mb_); + // TensorRT releated. + CP_MEMBER(use_tensorrt_); + CP_MEMBER(tensorrt_workspace_size_); + CP_MEMBER(tensorrt_max_batchsize_); + CP_MEMBER(tensorrt_min_subgraph_size_); + // MKLDNN releated. + CP_MEMBER(use_mkldnn_); + CP_MEMBER(mkldnn_enabled_op_types_); + + // Ir related. + CP_MEMBER(enable_ir_optim_); + CP_MEMBER(use_feed_fetch_ops_); + CP_MEMBER(ir_debug_); + CP_MEMBER(specify_input_name_); + + CP_MEMBER(cpu_math_library_num_threads_); + + CP_MEMBER(serialized_info_cache_); + + if (use_gpu_) { pass_builder_.reset(new GpuPassStrategy( *static_cast(other.pass_builder()))); } else { pass_builder_.reset(new CpuPassStrategy( *static_cast(other.pass_builder()))); } -} -contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { - // fields from Config - model_dir = other.model_dir; - // fields from NativeConfig - use_gpu = other.use_gpu; - device = other.device; - fraction_of_gpu_memory = other.fraction_of_gpu_memory; - prog_file = other.prog_file; - param_file = other.param_file; - specify_input_name = other.specify_input_name; - cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; - // fields from this. - enable_ir_optim = other.enable_ir_optim; - // For mkldnn - use_mkldnn_ = other.use_mkldnn_; - mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; - - use_feed_fetch_ops = other.use_feed_fetch_ops; - use_tensorrt_ = other.use_tensorrt_; - tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; - tensorrt_workspace_size_ = other.tensorrt_workspace_size_; - model_from_memory_ = other.model_from_memory_; - - pass_builder_ = std::move(other.pass_builder_); +#undef CP_MEMBER } void contrib::AnalysisConfig::EnableMKLDNN() { @@ -105,20 +122,95 @@ void contrib::AnalysisConfig::EnableMKLDNN() { } void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, - int max_batch_size) { + int max_batch_size, + int min_subgraph_size) { use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; - // Append after the infer_clean pass. 
- pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); +} + +void contrib::AnalysisConfig::Update() { + auto info = SerializeInfoCache(); + if (info == serialized_info_cache_) return; + + if (use_gpu_) { + pass_builder_.reset(new GpuPassStrategy); + } else { + pass_builder_.reset(new CpuPassStrategy); + } + + if (use_tensorrt_) { + if (!use_gpu_) { + LOG(ERROR) + << "TensorRT engine is not available when EnableGpu() not actived."; + } else { + // Append after the infer_clean pass. + pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); + } + } + + if (use_mkldnn_) { + if (!enable_ir_optim_) { + LOG(ERROR) + << "EnableMKLDNN() only works when IR optimization is enabled."; + } +#ifdef PADDLE_WITH_MKLDNN + pass_builder()->EnableMKLDNN(); + use_mkldnn_ = true; +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; + use_mkldnn_ = false; +#endif + } + + if (ir_debug_) { + pass_builder()->TurnOnDebug(); + } +} + +std::string contrib::AnalysisConfig::SerializeInfoCache() { + std::stringstream ss; + ss << use_gpu_; + ss << memory_pool_init_size_mb_; + + ss << use_tensorrt_; + ss << tensorrt_workspace_size_; + ss << tensorrt_max_batchsize_; + + ss << use_mkldnn_; + ss << enable_ir_optim_; + ss << use_feed_fetch_ops_; + ss << ir_debug_; + + return ss.str(); +} + +void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads( + int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; +} + +float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { +#ifdef PADDLE_WITH_CUDA + // Get the GPU memory details and calculate the fraction of memory for the + // GPU memory pool. + size_t gpu_used, gpu_available; + platform::GpuMemoryUsage(&gpu_used, &gpu_available); + double total_gpu_memory = (gpu_used + gpu_available) / 1024. / 1024.; + float fraction_of_gpu_memory = + static_cast(memory_pool_init_size_mb()) / total_gpu_memory; + return fraction_of_gpu_memory; +#else + return 0.; +#endif } void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, size_t prog_buffer_size, const char *param_buffer, size_t param_buffer_size) { - prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size); - param_file = std::string(param_buffer, param_buffer + param_buffer_size); + prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size); + params_file_ = std::string(param_buffer, param_buffer + param_buffer_size); model_from_memory_ = true; } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3937884ce4..585634fae9 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -33,6 +33,7 @@ #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); @@ -59,8 +60,8 @@ bool AnalysisPredictor::Init( if (FLAGS_profile) { LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(INFO) << "You can turn off by set gflags '-profile false'"; - auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll - : platform::ProfilerState::kCPU; + auto tracking_device = config_.use_gpu() ? 
platform::ProfilerState::kAll + : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); } @@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram( // Optimize the program, and load parameters and modify them in the // scope_. // This will change the scope_ address. - if (config_.enable_ir_optim) { + if (config_.ir_optim()) { status_ir_optim_enabled_ = true; OptimizeInferenceProgram(); } else { @@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram( return true; } bool AnalysisPredictor::CreateExecutor() { - if (config_.use_gpu) { + if (config_.use_gpu_) { status_use_gpu_ = true; - place_ = paddle::platform::CUDAPlace(config_.device); + place_ = paddle::platform::CUDAPlace(config_.device_id_); } else { place_ = paddle::platform::CPUPlace(); } @@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() { } bool AnalysisPredictor::PrepareExecutor() { executor_->Prepare(sub_scope_, *inference_program_, 0, - config_.use_feed_fetch_ops); + config_.use_feed_fetch_ops_); PADDLE_ENFORCE_NOT_NULL(sub_scope_); @@ -250,8 +251,13 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, } input.set_lod(lod); int idx = -1; - if (config_.specify_input_name) { - idx = feed_names_[inputs[i].name]; + if (config_.specify_input_name_) { + auto name = inputs[i].name; + if (feed_names_.find(name) == feed_names_.end()) { + LOG(ERROR) << "feed names from program do not have name: [" << name + << "] from specified input"; + } + idx = feed_names_[name]; } else { idx = boost::get(feeds_[i]->GetAttr("col")); } @@ -309,25 +315,26 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, void AnalysisPredictor::OptimizeInferenceProgram() { status_program_optimized_ = true; - argument_.SetUseGPU(config_.use_gpu); - argument_.SetGPUDeviceId(config_.device); + argument_.SetUseGPU(config_.use_gpu()); + argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program - if (!config_.model_dir.empty()) { - argument_.SetModelDir(config_.model_dir); + if (!config_.model_dir().empty()) { + argument_.SetModelDir(config_.model_dir()); } else { PADDLE_ENFORCE( - !config_.param_file.empty(), + !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); - PADDLE_ENFORCE(!config_.prog_file.empty()); - argument_.SetModelProgramPath(config_.prog_file); - argument_.SetModelParamsPath(config_.param_file); + PADDLE_ENFORCE(!config_.prog_file().empty()); + argument_.SetModelProgramPath(config_.prog_file()); + argument_.SetModelParamsPath(config_.params_file()); } - if (config_.use_gpu && config_.use_tensorrt_) { + if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { argument_.SetUseTensorRT(true); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); + argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); } if (config_.use_mkldnn_) { @@ -335,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } auto passes = config_.pass_builder()->AllPasses(); - if (!config_.enable_ir_optim) passes.clear(); + if (!config_.ir_optim()) passes.clear(); argument_.SetIrAnalysisPasses(passes); argument_.SetScopeNotOwned(const_cast(scope_.get())); Analyzer().Run(&argument_); @@ -352,18 +359,26 @@ template <> std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; - if (config.use_gpu) { + if 
(config.use_gpu()) { // 1. GPU memeroy - PADDLE_ENFORCE_GT( - config.fraction_of_gpu_memory, 0.f, - "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); - PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); + PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", + config.gpu_device_id()); std::vector flags; - if (config.fraction_of_gpu_memory >= 0.0f || - config.fraction_of_gpu_memory <= 0.95f) { + + float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool(); + if (fraction_of_gpu_memory > 0.95f) { + LOG(ERROR) + << "Allocate too much memory for the GPU memory pool, assigned " + << config.memory_pool_init_size_mb() << " MB"; + LOG(ERROR) + << "Try to shink the value by setting AnalysisConfig::EnableGpu(...)"; + } + + if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) { flags.push_back("dummpy"); std::string flag = "--fraction_of_gpu_memory_to_use=" + - std::to_string(config.fraction_of_gpu_memory); + std::to_string(fraction_of_gpu_memory); flags.push_back(flag); VLOG(3) << "set flag: " << flag; framework::InitGflags(flags); @@ -437,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() { bool AnalysisPredictor::LoadProgramDesc() { // Initialize the inference program std::string filename; - if (!config_.model_dir.empty()) { - filename = config_.model_dir + "/__model__"; - } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + if (!config_.model_dir().empty()) { + filename = config_.model_dir() + "/__model__"; + } else if (!config_.prog_file().empty() && !config_.params_file().empty()) { // All parameters are saved in a single file. // The file names should be consistent with that used // in Python API `fluid.io.save_inference_model`. 
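fraction_of_gpu_memory_for_pool() above converts the absolute pool size configured via EnableUseGpu into the fraction expected by the legacy --fraction_of_gpu_memory_to_use flag that CreatePaddlePredictor still sets. A quick worked example of that conversion; the 8 GB device total is an assumed figure, while the real code derives it from gpu_used + gpu_available:

#include <cstdint>
#include <cstdio>

int main() {
  const double total_gpu_memory_mb = 8.0 * 1024.0;           // assumed 8 GB card
  const std::uint64_t memory_pool_init_size_mb = 100;        // AnalysisConfig default
  const double fraction =
      static_cast<double>(memory_pool_init_size_mb) / total_gpu_memory_mb;
  std::printf("--fraction_of_gpu_memory_to_use=%f\n", fraction);  // ~0.012207
  return 0;
}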
- filename = config_.prog_file; + filename = config_.prog_file(); } else { - if (config_.model_dir.empty() && config_.prog_file.empty()) { + if (config_.model_dir().empty() && config_.prog_file().empty()) { LOG(ERROR) << "Either model_dir or (prog_file, param_file) should be set."; return false; } LOG(ERROR) << string::Sprintf( - "not valid model path '%s' or program path '%s'.", config_.model_dir, - config_.param_file); + "not valid model path '%s' or program path '%s'.", config_.model_dir(), + config_.params_file()); return false; } @@ -472,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() { proto.ParseFromString(pb_content); } else { - proto.ParseFromString(config_.prog_file); + proto.ParseFromString(config_.prog_file()); } inference_program_.reset(new framework::ProgramDesc(proto)); return true; @@ -502,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() { new_var->SetLoDLevel(var->GetLoDLevel()); new_var->SetPersistable(true); - if (!config_.param_file.empty()) { + if (!config_.params_file().empty()) { params.push_back(new_var->Name()); } else { // append_op framework::OpDesc *op = load_block->AppendOp(); op->SetType("load"); op->SetOutput("Out", {new_var->Name()}); - op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()}); + op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()}); op->CheckAttrs(); } } } - if (!config_.param_file.empty()) { + if (!config_.params_file().empty()) { // sort paramlist to have consistent ordering std::sort(params.begin(), params.end()); // append just the load_combine op framework::OpDesc *op = load_block->AppendOp(); op->SetType("load_combine"); op->SetOutput("Out", params); - op->SetAttr("file_path", {config_.param_file}); + op->SetAttr("file_path", {config_.params_file()}); op->CheckAttrs(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 12ecb7c15e..a6e126c5d5 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -35,8 +35,11 @@ using framework::proto::ProgramDesc; using framework::NaiveExecutor; using contrib::AnalysisConfig; -/* This predictor is based on the original native predictor with IR and Analysis - * support. It will optimize IR and Parameters in the runtime. +/** \brief This predictor is based on the original native predictor with IR and + * Analysis support. + * + * It will optimize IR and Parameters in the runtime. + * * TODO(Superjomn) Replace the Navive predictor? 
*/ class AnalysisPredictor : public PaddlePredictor { diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index a361b34437..6169e60541 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -25,9 +25,9 @@ namespace paddle { using contrib::AnalysisConfig; TEST(AnalysisPredictor, analysis_off) { - AnalysisConfig config(false); - config.model_dir = FLAGS_dirname; - config.enable_ir_optim = false; + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(false); auto _predictor = CreatePaddlePredictor(config); auto* predictor = static_cast(_predictor.get()); @@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) { } TEST(AnalysisPredictor, analysis_on) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(true); #ifdef PADDLE_WITH_CUDA - AnalysisConfig config(true); - config.fraction_of_gpu_memory = 0.15; + config.EnableUseGpu(100, 0); #else - AnalysisConfig config; + config.DisableGpu(); #endif - config.model_dir = FLAGS_dirname; - config.enable_ir_optim = true; auto _predictor = CreatePaddlePredictor(config); auto* predictor = static_cast(_predictor.get()); @@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) { } // compare with NativePredictor - auto naive_predictor = CreatePaddlePredictor(config); + auto naive_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); std::vector naive_outputs; ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs)); ASSERT_EQ(naive_outputs.size(), 1UL); @@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) { TEST(AnalysisPredictor, ZeroCopy) { AnalysisConfig config; - config.model_dir = FLAGS_dirname; - config.use_feed_fetch_ops = false; - + config.SetModel(FLAGS_dirname); + config.SwitchUseFeedFetchOps(false); auto predictor = CreatePaddlePredictor(config); auto w0 = predictor->GetInputTensor("firstw"); @@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) { TEST(AnalysisPredictor, Clone) { AnalysisConfig config; - config.model_dir = FLAGS_dirname; - config.use_feed_fetch_ops = true; - config.enable_ir_optim = true; + config.SetModel(FLAGS_dirname); + config.SwitchUseFeedFetchOps(true); + config.SwitchIrOptim(true); std::vector> predictors; predictors.emplace_back(CreatePaddlePredictor(config)); diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index 6a8b81cc57..e14d93de2c 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -19,8 +19,6 @@ limitations under the License. */ #pragma once -#define WITH_ANAKIN - #include #include "framework/core/net/net.h" diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 102147a493..85e250aaaf 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -288,7 +288,7 @@ std::unique_ptr CreatePaddlePredictor< VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { // 1. 
GPU memeroy - PADDLE_ENFORCE_GT( + PADDLE_ENFORCE_GE( config.fraction_of_gpu_memory, 0.f, "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index c1fcd198cc..d2133bd467 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -19,7 +19,6 @@ limitations under the License. */ #include #include #include - #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 7839639739..54895679ca 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) { #endif TEST(PassBuilder, Delete) { - contrib::AnalysisConfig config(false); + contrib::AnalysisConfig config; + config.DisableGpu(); config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); const auto& passes = config.pass_builder()->AllPasses(); auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass"); diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 61ecd7bce6..30215e480f 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -36,12 +36,11 @@ namespace demo { */ void Main() { std::unique_ptr predictor; - paddle::contrib::AnalysisConfig config(true); - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.device = 0; + paddle::contrib::AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(FLAGS_modeldir + "/__params__", + FLAGS_modeldir + "/__model__"); config.EnableTensorRtEngine(); - config.fraction_of_gpu_memory = 0.1; // set by yourself predictor = CreatePaddlePredictor(config); VLOG(3) << "begin to process data"; diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index bc8891455d..5320992b7e 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -40,15 +40,14 @@ using contrib::AnalysisConfig; */ void Main(bool use_gpu) { std::unique_ptr predictor, analysis_predictor; - AnalysisConfig config(use_gpu); - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.device = 0; - if (FLAGS_use_gpu) { - config.fraction_of_gpu_memory = 0.1; // set by yourself + AnalysisConfig config; + if (use_gpu) { + config.EnableUseGpu(100, 0); } + config.SetModel(FLAGS_modeldir + "/__model__", + FLAGS_modeldir + "/__params__"); - predictor = CreatePaddlePredictor(config); + predictor = CreatePaddlePredictor(config.ToNativeConfig()); analysis_predictor = CreatePaddlePredictor(config); // Just a single batch of data. 
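Taken together, the demo updates above show the intended use of the reworked AnalysisConfig: everything goes through setters instead of public fields. A condensed usage sketch based only on the calls visible in this diff; the model paths and pool size are placeholders, and the include path depends on how the inference library is consumed:

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::contrib::AnalysisConfig config;
  // Model location: a directory, or an explicit (program, params) pair.
  config.SetModel("/path/to/__model__", "/path/to/__params__");
  // 100 MB initial GPU memory pool on device 0; call DisableGpu() for CPU-only.
  config.EnableUseGpu(100, 0);
  // Optionally offload supported subgraphs to TensorRT.
  config.EnableTensorRtEngine(1 << 20 /*workspace*/, 1 /*max batch*/,
                              3 /*min subgraph size*/);
  config.SwitchIrOptim(true);

  auto analysis_predictor = paddle::CreatePaddlePredictor(config);
  // The same config can still drive the native predictor.
  auto native_predictor =
      paddle::CreatePaddlePredictor(config.ToNativeConfig());
  return 0;
}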
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc index 569a487328..03c2aa3fb8 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.cc +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -25,7 +25,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { // TODO(Superjomn) should avoid the case when a TensorArray is a // parameter. if (var_name == "feed" || var_name == "fetch") continue; - if (var->Type() == typeid(framework::LoDTensorArray)) { + if (var->IsType()) { VLOG(4) << "collect " << var_name; arrays_.push_back(var->GetMutable()); } diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index 6a5ea64de6..213c6891d0 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -27,8 +27,11 @@ namespace details { // training phase. struct TensorArrayBatchCleaner { TensorArrayBatchCleaner() { - valid_types_.insert(typeid(framework::Tensor)); - valid_types_.insert(typeid(framework::LoDTensor)); + constexpr auto kTensorId = framework::VarTypeTrait::kId; + constexpr auto kLoDTensorId = + framework::VarTypeTrait::kId; + valid_types_.insert(kTensorId); + valid_types_.insert(kLoDTensorId); } // Collect the variables that are not Tensor or LoDTensor, and reset them to a // bool(trick), because some of them are containers, and some operators just @@ -46,7 +49,7 @@ struct TensorArrayBatchCleaner { bool no_tensor_flag_{true}; std::vector arrays_; - std::unordered_set valid_types_; + std::unordered_set valid_types_; std::unordered_set no_tensor_vars_; }; diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 9a393a61c4..7830e85956 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -113,6 +113,16 @@ static void TensorAssignData(PaddleTensor *tensor, } } +template +static void TensorAssignData(PaddleTensor *tensor, + const std::vector> &data, + const std::vector &lod) { + int size = lod[lod.size() - 1]; + tensor->shape.assign({size, 1}); + tensor->lod.assign({lod}); + TensorAssignData(tensor, data); +} + template static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, const std::vector> &data) { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index f05b9832da..ae6ac69854 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -19,6 +19,8 @@ #include #include +/*! \file */ + // Here we include some header files with relative paths, for that in deploy, // the abstract path of this header file will be changed. #include "paddle_api.h" // NOLINT @@ -34,54 +36,219 @@ class AnalysisPredictor; namespace contrib { // NOTE WIP, not stable yet. -struct AnalysisConfig : public NativeConfig { - explicit AnalysisConfig(bool use_gpu = false); +struct AnalysisConfig { + AnalysisConfig() = default; explicit AnalysisConfig(const AnalysisConfig& other); - explicit AnalysisConfig(AnalysisConfig&& other); + explicit AnalysisConfig(const std::string& model_dir); + explicit AnalysisConfig(const std::string& prog_file, + const std::string& params_file); - // Determine whether to perform graph optimization. - bool enable_ir_optim = true; + /** Set model with a directory. 
+ */ + void SetModel(const std::string& model_dir) { model_dir_ = model_dir; } + /** Set model with two specific pathes for program and parameters. + */ + void SetModel(const std::string& prog_file_path, + const std::string& params_file_path); + /** Set program file path. + */ + void SetProgFile(const std::string& x) { prog_file_ = x; } + /** Set parameter composed file path. + */ + void SetParamsFile(const std::string& x) { params_file_ = x; } + /** Get the model directory path. + */ + const std::string& model_dir() const { return model_dir_; } + /** Get the program file path. + */ + const std::string& prog_file() const { return prog_file_; } + /** Get the composed parameters file. + */ + const std::string& params_file() const { return params_file_; } - // Get a pass builder for customize the passes in IR analysis phase. - PassStrategy* pass_builder() const; + // GPU related. + + /** + * \brief Turn on GPU. + * @param memory_pool_init_size_mb initial size of the GPU memory pool in MB. + * @param device_id the GPU card to use (default is 0). + */ + void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0); + /** Turn off the GPU. + */ + void DisableGpu(); + /** A bool state telling whether the GPU is turned on. + */ + bool use_gpu() const { return use_gpu_; } + /** Get the GPU device id. + */ + int gpu_device_id() const { return device_id_; } + /** Get the initial size in MB of the GPU memory pool. + */ + int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; } + /** Get the proportion of the initial memory pool size compared to the device. + */ + float fraction_of_gpu_memory_for_pool() const; + + /** \brief Control whether to perform IR graph optimization. + * + * If turned off, the AnalysisConfig will act just like a NativeConfig. + */ + void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; } + /** A boolean state tell whether the ir graph optimization is actived. + */ + bool ir_optim() const { return enable_ir_optim_; } - // NOT stable yet. - bool use_feed_fetch_ops{true}; + /** \brief INTERNAL Determine whether to use the feed and fetch operators. + * Just for internal development, not stable yet. + * When ZeroCopyTensor is used, this should turned off. + */ + void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; } + /** A boolean state telling whether to use the feed and fetch operators. + */ + bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; } + /** \brief Control whether to specify the inputs' names. + * + * The PaddleTensor type has a `name` member, assign it with the corresponding + * variable name. This is used only when the input PaddleTensors passed to the + * `PaddlePredictor.Run(...)` cannot follow the order in the training phase. + */ + void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; } + + /** A boolean state tell whether the input PaddleTensor names specified should + * be used to reorder the inputs in `PaddlePredictor.Run(...)`. + */ + bool specify_input_name() const { return specify_input_name_; } + + /** + * \brief Turn on the TensorRT engine. + * + * The TensorRT engine will accelerate some subgraphes in the original Fluid + * computation graph. In some models such as TensorRT50, GoogleNet and so on, + * it gains significant performance acceleration. + * + * @param workspace_size the memory size(in byte) used for TensorRT workspace. + * @param max_batch_size the maximum batch size of this prediction task, + * better set as small as possible, or performance loss. 
+ * @param min_subgrpah_size the minimum TensorRT subgraph size needed, if a + * subgraph is less than this, it will not transfer to TensorRT engine. + */ void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1); - bool use_tensorrt() const { return use_tensorrt_; } + int max_batch_size = 1, int min_subgraph_size = 3); + /** A boolean state telling whether the TensorRT engine is used. + */ + bool tensorrt_engine_enabled() const { return use_tensorrt_; } + /** Control whther to debug IR graph analysis phase. + */ + void SwitchIrDebug(int x = true) { ir_debug_ = x; } + + /** Turn on MKLDNN. + */ void EnableMKLDNN(); - bool use_mkldnn() const { return use_mkldnn_; } + /** A boolean state telling whether to use the MKLDNN. + */ + bool mkldnn_enabled() const { return use_mkldnn_; } + + /** Set and get the number of cpu math library threads. + */ + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads); + /** An int state telling how many threads are used in the CPU math library. + */ + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; + } + + /** Transform the AnalysisConfig to NativeConfig. + */ + NativeConfig ToNativeConfig() const { + NativeConfig config; + config.model_dir = model_dir_; + config.prog_file = prog_file_; + config.param_file = params_file_; + config.use_gpu = use_gpu_; + config.device = device_id_; + config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); + config.specify_input_name = specify_input_name_; + return config; + } + /** Specify the operator type list to use MKLDNN acceleration. + * @param op_list the operator type list. + */ void SetMKLDNNOp(std::unordered_set op_list) { mkldnn_enabled_op_types_ = op_list; } - // Specify the memory buffer of program and parameter + /** Specify the memory buffer of program and parameter + * @param prog_buffer the memory buffer of program. + * @param prog_buffer_size the size of the data. + * @param params_buffer the memory buffer of the composed parameters file. + * @param params_buffer_size the size of the commposed parameters data. + */ void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, - const char* program_buffer, size_t program_buffer_size); + const char* params_buffer, size_t params_buffer_size); + /** A boolean state telling whether the model is set from the CPU memory. + */ bool model_from_memory() const { return model_from_memory_; } friend class ::paddle::AnalysisPredictor; + /** NOTE just for developer, not an official API, easily to be broken. + * Get a pass builder for customize the passes in IR analysis phase. + */ + PassStrategy* pass_builder() const; + protected: + // Update the config. + void Update(); + + std::string SerializeInfoCache(); + + protected: + // Model pathes. + std::string model_dir_; + std::string prog_file_; + std::string params_file_; + + // GPU releated. + bool use_gpu_{false}; + int device_id_{0}; + uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. + + // TensorRT releated. bool use_tensorrt_{false}; - bool use_mkldnn_{false}; - std::unordered_set mkldnn_enabled_op_types_; + // For workspace_size, refer it from here: + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting int tensorrt_workspace_size_; + // While TensorRT allows an engine optimized for a given max batch size + // to run at any smaller size, the performance for those smaller + // sizes may not be as well-optimized. 
Therefore, Max batch is best + // equivalent to the runtime batch size. int tensorrt_max_batchsize_; - std::unique_ptr pass_builder_; + // We transform the Ops that can be converted into TRT layer in the model, + // and aggregate these Ops into subgraphs for TRT execution. + // We set this variable to control the minimum number of nodes in the + // subgraph, 3 as default value. + int tensorrt_min_subgraph_size_{3}; + + bool use_mkldnn_{false}; + std::unordered_set mkldnn_enabled_op_types_; + bool model_from_memory_{false}; -}; -// Configurations for Anakin engine. -struct AnakinConfig : public PaddlePredictor::Config { - enum TargetType { NVGPU = 0, X86 }; - int device; - std::string model_file; - int max_batch_size{-1}; - TargetType target_type; + bool enable_ir_optim_{true}; + bool use_feed_fetch_ops_{true}; + bool ir_debug_{false}; + + bool specify_input_name_{false}; + + int cpu_math_library_num_threads_{1}; + + // A runtime cache, shouldn't be transferred to others. + std::string serialized_info_cache_; + + mutable std::unique_ptr pass_builder_; }; } // namespace contrib diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 1513a4b3b4..3642f36127 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -13,61 +13,76 @@ // limitations under the License. #pragma once +/*! \file paddle_api.h + */ + #include #include #include #include +/*! \namespace paddle + */ namespace paddle { -// Data type. +/** paddle data type. + */ enum PaddleDType { FLOAT32, INT64, // TODO(Superjomn) support more data types if needed. }; -/* - * Memory menage for PaddleTensor. - * The PaddleBuf holds a buffer for data input or output. The memory can be - * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf - * should be reused for better performance. +/** + *\brief Memory menager for PaddleTensor. * - * For user allocated memory, the following API can be used: - * - PaddleBuf(void* data, size_t length) to set an external memory by - * specifying - * the memory address and length. - * - Reset(void* data, size_t length) to reset the PaddleBuf with an external - * memory. - * ATTENTION, for user allocated memory, deallocation should be done by users - * externally after the program finished. The PaddleBuf won't do any allocation - * or deallocation. + *The PaddleBuf holds a buffer for data input or output. The memory can be + *allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf + *should be reused for better performance. * - * To have the PaddleBuf allocate and manage the memory: - * - PaddleBuf(size_t length) will allocate a memory of size `length`. - * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION - * if the allocated memory is larger than `length`, nothing will done. + *For user allocated memory, the following API can be used: + *- PaddleBuf(void* data, size_t length) to set an external memory by + *specifying + * the memory address and length. + *- Reset(void* data, size_t length) to reset the PaddleBuf with an external + *memory. + *ATTENTION, for user allocated memory, deallocation should be done by users + *externally after the program finished. The PaddleBuf won't do any allocation + *or deallocation. + * + *To have the PaddleBuf allocate and manage the memory: + *- PaddleBuf(size_t length) will allocate a memory of size `length`. 
+ *- Resize(size_t length) resize the memory to no less than `length`, ATTENTION + * if the allocated memory is larger than `length`, nothing will done. */ class PaddleBuf { public: - // PaddleBuf allocate memory internally, and manage it. + /** PaddleBuf allocate memory internally, and manage it. + */ explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} - // Set external memory, the PaddleBuf won't manage it. + /** Set external memory, the PaddleBuf won't manage it. + */ PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} - // Copy only available when memory is managed externally. + /** Copy only available when memory is managed externally. + */ explicit PaddleBuf(const PaddleBuf&); - // Resize the memory. + /** Resize the memory. + */ void Resize(size_t length); - // Reset to external memory, with address and length set. + /** Reset to external memory, with address and length set. + */ void Reset(void* data, size_t length); - // Tell whether the buffer is empty. + /** Tell whether the buffer is empty. + */ bool empty() const { return length_ == 0; } - // Get the memory address. + /** Get the memory address. + */ void* data() const { return data_; } - // Get the memory length. + /** Get the memory length. + */ size_t length() const { return length_; } ~PaddleBuf() { Free(); } @@ -83,7 +98,8 @@ class PaddleBuf { bool memory_owned_{true}; }; -// Basic input and output data structure for PaddlePredictor. +/** Basic input and output data structure for PaddlePredictor. + */ struct PaddleTensor { PaddleTensor() = default; std::string name; // variable name. @@ -94,19 +110,22 @@ struct PaddleTensor { }; enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; -// Tensor without copy, currently only supports AnalysisPredictor. +/** Tensor without copy, currently only supports AnalysisPredictor. + */ class ZeroCopyTensor { public: void Reshape(const std::vector& shape); - // Get the memory in CPU or GPU with specific data type, should Reshape first - // to tell the data size. - // Once can directly call this data to feed the data. - // This is for write the input tensor. + /** Get the memory in CPU or GPU with specific data type, should Reshape first + * to tell the data size. + * Once can directly call this data to feed the data. + * This is for write the input tensor. + */ template T* mutable_data(PaddlePlace place); - // Get the memory directly, will return the place and memory size by pointer. - // This is for reading the output tensor. + /** Get the memory directly, will return the place and memory size by pointer. + * This is for reading the output tensor. + */ template T* data(PaddlePlace* place, int* size) const; @@ -128,8 +147,7 @@ class ZeroCopyTensor { void* scope_{nullptr}; }; -/* - * A simple Inference API for Paddle. +/** A simple Inference API for Paddle. */ class PaddlePredictor { public: @@ -138,18 +156,20 @@ class PaddlePredictor { PaddlePredictor(const PaddlePredictor&) = delete; PaddlePredictor& operator=(const PaddlePredictor&) = delete; - // Predict an record. - // The caller should be responsible for allocating and releasing the memory of - // `inputs`. `inputs` should be available until Run returns. Caller should be - // responsible for the output tensor's buffer, either allocated or passed from - // outside. + /** Predict an record. + * The caller should be responsible for allocating and releasing the memory of + * `inputs`. `inputs` should be available until Run returns. 
Caller should be + * responsible for the output tensor's buffer, either allocated or passed from + * outside. + */ virtual bool Run(const std::vector& inputs, std::vector* output_data, int batch_size = -1) = 0; - // Zero copy input and output optimization. - // Get the input or output tensors, and operate on their memory directly, - // without copy. + /** Zero copy input and output optimization. + * Get the input or output tensors, and operate on their memory directly, + * without copy. + */ virtual std::unique_ptr GetInputTensor( const std::string& name) { return nullptr; @@ -160,16 +180,19 @@ class PaddlePredictor { } virtual bool ZeroCopyRun() { return false; } - // Clone a predictor that share the model weights, the Cloned predictor should - // be thread-safe. + /** Clone a predictor that share the model weights, the Cloned predictor + * should be thread-safe. + */ virtual std::unique_ptr Clone() = 0; - // Destroy the Predictor. + /** Destroy the Predictor. + */ virtual ~PaddlePredictor() = default; - // The common configs for all the predictors. + /** The common configs for all the predictors. + */ struct Config { - std::string model_dir; // path to the model directory. + std::string model_dir; /*!< path to the model directory. */ }; }; @@ -177,17 +200,21 @@ struct NativeConfig : public PaddlePredictor::Config { // GPU related fields. bool use_gpu{false}; int device{0}; - float fraction_of_gpu_memory{-1.f}; // Change to a float in (0,1] if needed. + float fraction_of_gpu_memory{ + -1.f}; /*!< Change to a float in (0,1] if needed. */ // Specify the exact path of program and parameter files. std::string prog_file; std::string param_file; - // Specify the variable's name of each input if input tensors don't follow the - // `feeds` and `fetches` of the phase `save_inference_model`. + /** Specify the variable's name of each input if input tensors don't follow + * the + * `feeds` and `fetches` of the phase `save_inference_model`. + */ bool specify_input_name{false}; - // Set and get the number of cpu math library threads. + /** Set and get the number of cpu math library threads. + */ void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) { cpu_math_library_num_threads_ = cpu_math_library_num_threads; } @@ -201,28 +228,33 @@ struct NativeConfig : public PaddlePredictor::Config { int cpu_math_library_num_threads_{1}; }; -// A factory to help create different predictors. -// -// Usage: -// -// NativeConfig config; -// ... // change the configs. -// auto native_predictor = CreatePaddlePredictor(config); -// -// FOR EXTENSION DEVELOPER: -// Different predictors are designated by config type. Similar configs can be -// merged, but there shouldn't be a huge config containing different fields for -// more than one kind of predictors. +/*! \fn std::unique_ptr CreatePaddlePredictor(const ConfigT& + * config); + * + * \brief A factory to help create different predictors. + * + * Usage: + * + * NativeConfig config; + * ... // change the configs. + * auto native_predictor = CreatePaddlePredictor(config); + * + * FOR EXTENSION DEVELOPER: + * Different predictors are designated by config type. Similar configs can be + * merged, but there shouldn't be a huge config containing different fields for + * more than one kind of predictors. + */ template std::unique_ptr CreatePaddlePredictor(const ConfigT& config); -// NOTE The following APIs are too trivial, we will discard it in the following -// versions. 
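To make the Run() contract documented above concrete, here is a minimal native-engine sketch, not part of the patch; the model directory, feed name, and shape are hypothetical, and the include path assumes an in-tree build:

    #include <vector>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle::NativeConfig config;
      config.model_dir = "./mobilenet";            // hypothetical model directory
      config.use_gpu = false;

      auto predictor =
          paddle::CreatePaddlePredictor<paddle::NativeConfig>(config);

      // The caller owns this buffer; PaddleBuf merely wraps it and it must
      // stay alive until Run() returns.
      std::vector<float> input(1 * 3 * 224 * 224, 0.f);

      std::vector<paddle::PaddleTensor> inputs(1);
      inputs[0].name = "image";                    // hypothetical feed name
      inputs[0].shape = {1, 3, 224, 224};
      inputs[0].dtype = paddle::PaddleDType::FLOAT32;
      inputs[0].data.Reset(input.data(), input.size() * sizeof(float));

      // Output buffers live in caller-provided PaddleTensors.
      std::vector<paddle::PaddleTensor> outputs;
      predictor->Run(inputs, &outputs, /*batch_size=*/1);
      return 0;
    }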
+/** NOTE The following APIs are too trivial, we will discard it in the following + * versions. + */ enum class PaddleEngineKind { - kNative = 0, // Use the native Fluid facility. - kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. - kAnalysis, // More optimization. - kAnakin // Use Anakin for inference, not mature yet. + kNative = 0, /*!< Use the native Fluid facility. */ + kAutoMixedTensorRT, /*!< Automatically mix Fluid with TensorRT. */ + kAnalysis, /*!< More optimization. */ + kAnakin /*!< Use Anakin for inference, not mature yet. */ }; template diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 92fb51d647..1785bd520a 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -26,9 +26,8 @@ limitations under the License. */ #include #include -#include "paddle_api.h" // NOLINT -#ifndef WITH_ANAKIN #include "paddle_analysis_config.h" // NOLINT -#else +#include "paddle_api.h" // NOLINT +#ifdef WITH_ANAKIN #include "paddle_anakin_config.h" // NOLINT #endif diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 40ca0d287c..9337ae55b7 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -18,30 +18,39 @@ #include #include +/*! \file */ + +/*! \namespace paddle */ namespace paddle { -/* - * This is a pass builder based on string. It is part of inference API. + +/** This is a pass builder based on string. It is part of inference API. */ class PaddlePassBuilder { public: explicit PaddlePassBuilder(const std::vector &passes) : passes_(passes) {} + /** Append a pass to the end of the passes. */ void AppendPass(const std::string &pass_type); + /** Insert a pass to a specific position. + * @param idx the position to insert. + * @param pass_type the pass key. + */ void InsertPass(size_t idx, const std::string &pass_type); - // Delete the `idx`-th pass. + /** Delete the `idx`-th pass. */ void DeletePass(size_t idx); - // Delete all the passes that has type `pass_type`. + /** Delete all the passes that has type `pass_type`. */ void DeletePass(const std::string &pass_type); - // Visualize the computation graph after each pass by generating a DOT - // language file, one can draw them with the Graphviz toolkit. + /** Visualize the computation graph after each pass by generating a DOT + * language file, one can draw them with the Graphviz toolkit. + */ void TurnOnDebug(); - // Human-readible information. + /** Human-readible information. */ std::string DebugString(); const std::vector &AllPasses() const { return passes_; } @@ -50,23 +59,27 @@ class PaddlePassBuilder { std::vector passes_; }; -/* - * Pass strategy to help control the IR passes. +/**Pass strategy to help control the IR passes. */ class PassStrategy : public PaddlePassBuilder { public: explicit PassStrategy(const std::vector &passes) : PaddlePassBuilder(passes) {} - // The MKLDNN control exists in both CPU and GPU mode, because there can be - // still some CPU kernels running in CPU mode. + /** The MKLDNN control exists in both CPU and GPU mode, because there can be + * still some CPU kernels running in CPU mode. + */ virtual void EnableMKLDNN() = 0; + bool use_gpu() const { return use_gpu_; } + virtual ~PassStrategy() = default; + + protected: + bool use_gpu_{false}; }; -/* - * The CPU passes controller, it is used in AnalysisPredictor with CPU mode. 
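For orientation, a hedged sketch of how the pass builder documented above is reached from an AnalysisConfig and customized; the pass name is taken from the testers changed later in this patch, and the wrapper function is illustrative only:

    #include <iostream>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    void CustomizePasses(paddle::contrib::AnalysisConfig *config) {
      paddle::PassStrategy *builder = config->pass_builder();
      builder->DeletePass("fc_gru_fuse_pass");  // drop a problematic fuse pass
      builder->TurnOnDebug();                   // dump a DOT graph after every pass
      std::cout << builder->DebugString() << std::endl;
    }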
+/** The CPU passes controller, it is used in AnalysisPredictor with CPU mode. */ class CpuPassStrategy : public PassStrategy { public: @@ -88,6 +101,7 @@ class CpuPassStrategy : public PassStrategy { "conv_eltwiseadd_bn_fuse_pass", // "is_test_pass", // }); + use_gpu_ = false; } virtual ~CpuPassStrategy() = default; @@ -111,23 +125,28 @@ class CpuPassStrategy : public PassStrategy { CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {} }; -/* - * The GPU passes strategy, it is used in +/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode. */ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "conv_bn_fuse_pass", // - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // }); + + use_gpu_ = true; } GpuPassStrategy(const GpuPassStrategy &other) - : PassStrategy(other.AllPasses()) {} + : PassStrategy(other.AllPasses()) { + use_gpu_ = true; + } void EnableMKLDNN() override; diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 17f6c6d9f1..9afeafd176 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,5 @@ nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) +nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) add_subdirectory(plugin) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc new file mode 100644 index 0000000000..9fecad6eb3 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/op_teller.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// Just tell by the op_types. 
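The new tensorrt_op_teller target introduces the teller interface whose definitions follow below; as a hedged sketch, querying the global teller during subgraph detection looks roughly like this (the wrapper function is illustrative, not part of the patch):

    #include "paddle/fluid/framework/op_desc.h"
    #include "paddle/fluid/inference/tensorrt/op_teller.h"

    bool CanConvertToTrt(const paddle::framework::OpDesc &op) {
      // Ask every registered teller; any positive answer wins.
      return paddle::inference::tensorrt::OpTeller::Global().Tell(op.Type(), op);
    }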
+struct SimpleOpTypeSetTeller : public Teller { + SimpleOpTypeSetTeller() {} + + bool operator()(const std::string& op_type, + const framework::OpDesc& desc) override { + return teller_set.count(op_type); + } + + private: + std::unordered_set teller_set{ + {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", + "conv2d_transpose", "leaky_relu"}}; +}; + +bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) { + for (auto& teller : tellers_) { + if ((*teller)(op_type, desc)) return true; + } + return false; +} + +OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); } + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h new file mode 100644 index 0000000000..b98f052bf2 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_desc.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Single Op teller definition. + * One can override this and define a more complex tell logic, considerring more + * issues such as op_desc. + */ +struct Teller { + virtual bool operator()(const std::string& op_type, + const framework::OpDesc& desc) = 0; + + virtual ~Teller() = default; +}; +/* + * A real example: + * + * struct SomeTeller : public Teller { + * bool operator()(const std::string& op_type, + * const framework::OpDesc& desc) override { + * return op_type == "fc" && desc.Inputs().size() == 2; + * } + *}; + */ + +/* + * class OpTeller helps to tell whether a fluid + * operator can be transformed to a TensorRT layer. 
+ */ +class OpTeller { + public: + static OpTeller& Global() { + static std::unique_ptr x(new OpTeller); + return *x; + } + + bool Tell(const std::string& op_type, const framework::OpDesc& desc); + + private: + OpTeller(); + + private: + std::vector> tellers_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 95bbc74a59..131712ca88 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -41,7 +41,7 @@ endfunction() if(NOT APPLE AND WITH_MKLML) set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") - inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) + inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc SERIAL) else() # TODO: fix this test on MACOS and OPENBLAS, the reason is that # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS @@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 # normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc) +inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1) + ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1 SERIAL) # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") @@ -90,6 +90,11 @@ set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) +# seq_pool1 +set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") +download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) + # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) @@ -106,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 - "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") + "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL) # mobilenet with depthwise_conv op inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv - "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") + 
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL) # anakin if (WITH_ANAKIN AND WITH_MKL) # only needed in CI diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 12d61d06ce..5ad6e4a857 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -165,12 +165,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(contrib::AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(true); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 142801382b..b9666e01ad 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -98,20 +98,17 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, auto one_batch = data->NextBatch(); PaddleTensor input_tensor; input_tensor.name = "word"; - input_tensor.shape.assign({static_cast(one_batch.data.size()), 1}); - input_tensor.lod.assign({one_batch.lod}); input_tensor.dtype = PaddleDType::INT64; - TensorAssignData(&input_tensor, {one_batch.data}); + TensorAssignData(&input_tensor, {one_batch.data}, one_batch.lod); PADDLE_ENFORCE_EQ(batch_size, static_cast(one_batch.lod.size() - 1)); input_slots->assign({input_tensor}); } void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 8aaab6d664..1318fbcbc4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> query_data_all, title_data_all; + std::vector> query, title; std::vector lod1, lod2; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,22 +31,9 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. 
- if (batch_end <= query_data_all.size()) { - data.query_data_all.assign(query_data_all.begin() + batch_iter, - query_data_all.begin() + batch_end); - data.title_data_all.assign(title_data_all.begin() + batch_iter, - title_data_all.begin() + batch_end); - // Prepare LoDs - data.lod1.push_back(0); - data.lod2.push_back(0); - CHECK(!data.query_data_all.empty()); - CHECK(!data.title_data_all.empty()); - CHECK_EQ(data.query_data_all.size(), data.title_data_all.size()); - for (size_t j = 0; j < data.query_data_all.size(); j++) { - // calculate lod - data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size()); - data.lod2.push_back(data.lod2.back() + data.title_data_all[j].size()); - } + if (batch_end <= query.size()) { + GetInputPerBatch(query, &data.query, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title, &data.title, &data.lod2, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -67,8 +52,8 @@ struct DataRecord { // load title data std::vector title_data; split_to_int64(data[1], ' ', &title_data); - query_data_all.push_back(std::move(query_data)); - title_data_all.push_back(std::move(title_data)); + query.push_back(std::move(query_data)); + title.push_back(std::move(title_data)); } num_samples = num_lines; } @@ -80,15 +65,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_query_tensor.name = "left"; lod_title_tensor.name = "right"; auto one_batch = data->NextBatch(); - int size1 = one_batch.lod1[one_batch.lod1.size() - 1]; // token batch size - int size2 = one_batch.lod2[one_batch.lod2.size() - 1]; // token batch size - lod_query_tensor.shape.assign({size1, 1}); - lod_query_tensor.lod.assign({one_batch.lod1}); - lod_title_tensor.shape.assign({size2, 1}); - lod_title_tensor.lod.assign({one_batch.lod2}); // assign data - TensorAssignData(&lod_query_tensor, one_batch.query_data_all); - TensorAssignData(&lod_title_tensor, one_batch.title_data_all); + TensorAssignData(&lod_query_tensor, one_batch.query, one_batch.lod1); + TensorAssignData(&lod_title_tensor, one_batch.title, one_batch.lod2); // Set inputs. input_slots->assign({lod_query_tensor, lod_title_tensor}); for (auto &tensor : *input_slots) { @@ -97,11 +76,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(contrib::AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index f19a2ed59e..6fef79dc46 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> word_data_all, mention_data_all; + std::vector> word, mention; std::vector lod; // two inputs have the same lod info. 
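The same migration recurs throughout the testers in this patch: direct field assignments on the config give way to the new member functions. As a reference point, a hedged before/after sketch with a hypothetical model path:

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    void SetConfig(paddle::contrib::AnalysisConfig *cfg) {
      // Old style (removed by this patch):
      //   cfg->model_dir = "./infer_model";
      //   cfg->use_gpu = false;
      //   cfg->specify_input_name = true;
      //   cfg->enable_ir_optim = true;
      // New style:
      cfg->SetModel("./infer_model");   // or SetModel(prog_file, params_file)
      cfg->DisableGpu();                // or EnableUseGpu(100, 0) for a 100 MB GPU pool
      cfg->SwitchSpecifyInputNames();
      cfg->SwitchIrOptim();
    }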
- size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,20 +31,10 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= word_data_all.size()) { - data.word_data_all.assign(word_data_all.begin() + batch_iter, - word_data_all.begin() + batch_end); - data.mention_data_all.assign(mention_data_all.begin() + batch_iter, - mention_data_all.begin() + batch_end); - // Prepare LoDs - data.lod.push_back(0); - CHECK(!data.word_data_all.empty()); - CHECK(!data.mention_data_all.empty()); - CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size()); - for (size_t j = 0; j < data.word_data_all.size(); j++) { - // calculate lod - data.lod.push_back(data.lod.back() + data.word_data_all[j].size()); - } + if (batch_end <= word.size()) { + GetInputPerBatch(word, &data.word, &data.lod, batch_iter, batch_end); + GetInputPerBatch(mention, &data.mention, &data.lod, batch_iter, + batch_end); } batch_iter += batch_size; return data; @@ -65,27 +53,22 @@ struct DataRecord { // load mention data std::vector mention_data; split_to_int64(data[3], ' ', &mention_data); - word_data_all.push_back(std::move(word_data)); - mention_data_all.push_back(std::move(mention_data)); + word.push_back(std::move(word_data)); + mention.push_back(std::move(mention_data)); } num_samples = num_lines; } }; -void PrepareInputs(std::vector *input_slots, DataRecord *data, - int batch_size) { +void PrepareInputs(std::vector *input_slots, DataRecord *data) { PaddleTensor lod_word_tensor, lod_mention_tensor; lod_word_tensor.name = "word"; lod_mention_tensor.name = "mention"; auto one_batch = data->NextBatch(); - int size = one_batch.lod[one_batch.lod.size() - 1]; // token batch size - lod_word_tensor.shape.assign({size, 1}); - lod_word_tensor.lod.assign({one_batch.lod}); - lod_mention_tensor.shape.assign({size, 1}); - lod_mention_tensor.lod.assign({one_batch.lod}); // assign data - TensorAssignData(&lod_word_tensor, one_batch.word_data_all); - TensorAssignData(&lod_mention_tensor, one_batch.mention_data_all); + TensorAssignData(&lod_word_tensor, one_batch.word, one_batch.lod); + TensorAssignData(&lod_mention_tensor, one_batch.mention, + one_batch.lod); // Set inputs. input_slots->assign({lod_word_tensor, lod_mention_tensor}); for (auto &tensor : *input_slots) { @@ -101,13 +84,12 @@ void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) { cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0], buffer_param.size()); } else { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; + cfg->SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/param"); } - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { @@ -116,7 +98,7 @@ void SetInput(std::vector> *inputs) { int epoch = FLAGS_test_all_data ? 
data.num_samples / FLAGS_batch_size : 1; LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; for (int bid = 0; bid < epoch; ++bid) { - PrepareInputs(&input_slots, &data, FLAGS_batch_size); + PrepareInputs(&input_slots, &data); (*inputs).emplace_back(input_slots); } } diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 764ae5ed85..629981d565 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -21,12 +21,10 @@ namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/params"; - cfg->prog_file = FLAGS_infer_model + "/model"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchIrOptim(); + cfg->SwitchSpecifyInputNames(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 17f4587a50..3c52afbfb8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -204,12 +204,10 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, } void SetConfig(AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { @@ -225,10 +223,10 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_rnn1, profile) { - contrib::AnalysisConfig cfg(false); + contrib::AnalysisConfig cfg; SetConfig(&cfg); - cfg.fraction_of_gpu_memory = 0.1; - cfg.pass_builder()->TurnOnDebug(); + cfg.DisableGpu(); + cfg.SwitchIrDebug(); std::vector outputs; std::vector> input_slots_all; @@ -293,16 +291,18 @@ TEST(Analyzer_rnn1, multi_thread) { TEST(Analyzer_rnn1, ZeroCopy) { AnalysisConfig config; SetConfig(&config); - config.use_feed_fetch_ops = false; + config.SwitchUseFeedFetchOps(false); PaddlePlace place; auto predictor = CreatePaddlePredictor(config); - config.use_feed_fetch_ops = true; - auto native_predictor = CreatePaddlePredictor(config); + config.SwitchUseFeedFetchOps(true); + auto native_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); - config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. + config.SwitchUseFeedFetchOps( + true); // the analysis predictor needs feed/fetch. 
auto analysis_predictor = CreatePaddlePredictor(config); #define NEW_TENSOR(name__) \ @@ -362,7 +362,7 @@ TEST(Analyzer_rnn1, ZeroCopy) { TEST(Analyzer_rnn1, ZeroCopyMultiThread) { AnalysisConfig config; SetConfig(&config); - config.use_feed_fetch_ops = false; + config.SwitchUseFeedFetchOps(false); #define NEW_TENSOR(name__) \ auto name__##_tensor = predictor->GetInputTensor(#name__); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc index f8354e7687..007f9f0b66 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -105,12 +105,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index f5082cd60f..47c1d73758 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -18,12 +18,9 @@ namespace paddle { namespace inference { struct DataRecord { - std::vector> title1_all, title2_all, title3_all, l1_all; std::vector> title1, title2, title3, l1; - std::vector title1_lod, title2_lod, title3_lod, l1_lod; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + std::vector lod1, lod2, lod3, l1_lod; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,41 +30,11 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. 
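Outside of the test harness, the zero-copy path exercised in the rnn1 test above reduces to the following hedged sketch; GetOutputTensor is elided in the excerpt above but assumed to mirror GetInputTensor, and the model path and tensor names are hypothetical:

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    void RunZeroCopy() {
      paddle::contrib::AnalysisConfig config;
      config.SetModel("./rnn_model");           // hypothetical model directory
      config.SwitchUseFeedFetchOps(false);      // required for ZeroCopyTensor

      auto predictor =
          paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig>(config);

      auto input = predictor->GetInputTensor("word");   // hypothetical feed name
      input->Reshape({4, 1});
      int64_t *in_data = input->mutable_data<int64_t>(paddle::PaddlePlace::kCPU);
      for (int i = 0; i < 4; ++i) in_data[i] = i;       // write in place, no copy

      predictor->ZeroCopyRun();

      auto output = predictor->GetOutputTensor("prob"); // hypothetical fetch name
      paddle::PaddlePlace place;
      int size = 0;
      float *out_data = output->data<float>(&place, &size);
      (void)out_data;
    }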
- if (batch_end <= title1_all.size()) { - data.title1_all.assign(title1_all.begin() + batch_iter, - title1_all.begin() + batch_end); - data.title2_all.assign(title2_all.begin() + batch_iter, - title2_all.begin() + batch_end); - data.title3_all.assign(title3_all.begin() + batch_iter, - title3_all.begin() + batch_end); - data.l1_all.assign(l1_all.begin() + batch_iter, - l1_all.begin() + batch_end); - // Prepare LoDs - data.title1_lod.push_back(0); - data.title2_lod.push_back(0); - data.title3_lod.push_back(0); - data.l1_lod.push_back(0); - CHECK(!data.title1_all.empty()); - CHECK(!data.title2_all.empty()); - CHECK(!data.title3_all.empty()); - CHECK(!data.l1_all.empty()); - CHECK_EQ(data.title1_all.size(), data.title2_all.size()); - CHECK_EQ(data.title1_all.size(), data.title3_all.size()); - CHECK_EQ(data.title1_all.size(), data.l1_all.size()); - for (size_t j = 0; j < data.title1_all.size(); j++) { - data.title1.push_back(data.title1_all[j]); - data.title2.push_back(data.title2_all[j]); - data.title3.push_back(data.title3_all[j]); - data.l1.push_back(data.l1_all[j]); - // calculate lod - data.title1_lod.push_back(data.title1_lod.back() + - data.title1_all[j].size()); - data.title2_lod.push_back(data.title2_lod.back() + - data.title2_all[j].size()); - data.title3_lod.push_back(data.title3_lod.back() + - data.title3_all[j].size()); - data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size()); - } + if (batch_end <= title1.size()) { + GetInputPerBatch(title1, &data.title1, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title2, &data.title2, &data.lod2, batch_iter, batch_end); + GetInputPerBatch(title3, &data.title3, &data.lod3, batch_iter, batch_end); + GetInputPerBatch(l1, &data.l1, &data.l1_lod, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -92,10 +59,10 @@ struct DataRecord { // load l1 data std::vector l1_data; split_to_int64(data[3], ' ', &l1_data); - title1_all.push_back(std::move(title1_data)); - title2_all.push_back(std::move(title2_data)); - title3_all.push_back(std::move(title3_data)); - l1_all.push_back(std::move(l1_data)); + title1.push_back(std::move(title1_data)); + title2.push_back(std::move(title2_data)); + title3.push_back(std::move(title3_data)); + l1.push_back(std::move(l1_data)); } num_samples = num_lines; } @@ -109,24 +76,11 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, title3_tensor.name = "title3"; l1_tensor.name = "l1"; auto one_batch = data->NextBatch(); - int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1]; - title1_tensor.shape.assign({title1_size, 1}); - title1_tensor.lod.assign({one_batch.title1_lod}); - int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1]; - title2_tensor.shape.assign({title2_size, 1}); - title2_tensor.lod.assign({one_batch.title2_lod}); - int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1]; - title3_tensor.shape.assign({title3_size, 1}); - title3_tensor.lod.assign({one_batch.title3_lod}); - int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1]; - l1_tensor.shape.assign({l1_size, 1}); - l1_tensor.lod.assign({one_batch.l1_lod}); - // assign data - TensorAssignData(&title1_tensor, one_batch.title1); - TensorAssignData(&title2_tensor, one_batch.title2); - TensorAssignData(&title3_tensor, one_batch.title3); - TensorAssignData(&l1_tensor, one_batch.l1); + TensorAssignData(&title1_tensor, one_batch.title1, one_batch.lod1); + TensorAssignData(&title2_tensor, one_batch.title2, one_batch.lod2); + TensorAssignData(&title3_tensor, 
one_batch.title3, one_batch.lod3); + TensorAssignData(&l1_tensor, one_batch.l1, one_batch.l1_lod); // Set inputs. input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); for (auto &tensor : *input_slots) { @@ -135,11 +89,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc new file mode 100644 index 0000000000..a1742f6068 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -0,0 +1,186 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +struct OneSlotInBatch { + std::string name; + std::vector> data; + std::vector shape; + std::vector lod; +}; + +struct DataRecord { + std::vector> batched_data; + std::map>> datasets; + size_t batch_iter{0}, num_samples; // total number of samples + + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) { + Load(path); + Prepare(batch_size); + } + + void Load(const std::string &path) { + std::ifstream file(path); + constexpr int num_slots = 154; + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, '\t', &data); + std::vector slot_data; + split_to_float(data[1], ' ', &slot_data); + std::string name = data[0]; + PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0, + "line %d, %s should be divisible", num_lines, name); + datasets[name].emplace_back(std::move(slot_data)); + } + num_samples = num_lines / num_slots; + PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast(num_lines), + "num samples should be divisible"); + PADDLE_ENFORCE_GT(num_samples, 0); + } + + void Prepare(int bs) { + for (auto it = datasets.begin(); it != datasets.end(); ++it) { + PADDLE_ENFORCE_EQ(it->second.size(), num_samples, + "size of each slot should be equal"); + } + size_t num_batches = num_samples / bs; + EXPECT_GT(num_batches, 0); + batched_data.resize(num_batches); + for (auto &one_batch : batched_data) { + one_batch.resize(datasets.size()); + size_t i = 0; + for (auto it = datasets.begin(); it != datasets.end(); ++it) { + auto &slot = one_batch[i]; + slot.name = it->first; + slot.data.resize(bs); + slot.lod.resize(bs + 1); + slot.lod[0] = 0; + auto &lod = slot.lod; + auto &datas = it->second; + for (int k = 0; k < bs; ++k) { + size_t id = k + batch_iter * bs; + std::copy(datas[id].begin(), datas[id].end(), + 
std::back_inserter(slot.data[k])); + size_t len = datas[id].size() / 11; + PADDLE_ENFORCE_EQ(len * 11, datas[id].size(), + "%s %d size should be divisible", slot.name, id); + lod[k + 1] = lod[k] + len; + } + slot.shape.assign({static_cast(lod[bs]), 11}); + i++; + } + } + } + + const std::vector &NextBatch() { + if (batch_iter >= batched_data.size() - 1) { + batch_iter = -1; + } + return batched_data[++batch_iter]; + } +}; + +static void TensorAssignSlot(PaddleTensor *tensor, const OneSlotInBatch &slot) { + tensor->name = slot.name + "_embed"; + tensor->shape = slot.shape; + tensor->dtype = PaddleDType::FLOAT32; + tensor->lod.clear(); + tensor->lod.emplace_back(slot.lod); + TensorAssignData(tensor, slot.data); +} + +void PrepareInputs(std::vector *input_slots, DataRecord *data) { + const auto &one_batch = data->NextBatch(); + input_slots->resize(one_batch.size()); + for (size_t i = 0; i < one_batch.size(); ++i) { + auto &slot = one_batch[i]; + TensorAssignSlot(&((*input_slots)[i]), slot); + } +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->pass_builder()->TurnOnDebug(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.batched_data.size() : 1; + LOG(INFO) << "number of samples: " + << data.batched_data.size() * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data); + (*inputs).emplace_back(input_slots); + } +} + +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_seq_pool1, profile) { profile(); } + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_seq_pool1, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Check the fuse status +TEST(Analyzer_seq_pool1, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + LOG(INFO) << "num_ops: " << num_ops; + EXPECT_EQ(num_ops, 349); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 79f3c81ade..7b448a3200 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -47,11 +47,10 @@ struct DataReader { }; void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc 
b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index d73bccefd5..5a77b53a85 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -51,12 +51,11 @@ Record ProcessALine(const std::string &line) { } void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/__params__"; - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/__params__"); + cfg->DisableGpu(); + cfg->SwitchIrDebug(); + cfg->SwitchSpecifyInputNames(); // TODO(TJ): fix fusion gru cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); } diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index 7046bce303..cf0f1d5c18 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -64,19 +64,23 @@ std::ostream &operator<<(std::ostream &os, num_spaces++; os << *reinterpret_cast(&config); if (!config.model_from_memory()) { - os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; - os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; + os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n"; + os << GenSpaces(num_spaces) << "param_file: " << config.params_file() + << "\n"; } else { os << GenSpaces(num_spaces) << "prog_file and param_file: load from memory \n"; } - os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() << "\n"; + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() + << "\n"; + os << GenSpaces(num_spaces) + << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n"; os << GenSpaces(num_spaces) - << "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n"; - os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt() + << "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n"; + os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled() << "\n"; - os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index b0c8f395ce..41d033df85 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -132,7 +132,8 @@ std::unordered_map GetFuseStatis(PaddlePredictor *predictor, void SetFakeImageInput(std::vector> *inputs, const std::string &dirname, bool is_combined = true, std::string model_filename = "model", - std::string params_filename = "params") { + std::string params_filename = "params", + const std::vector *feed_names = nullptr) { // Set fake_image_data PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); std::vector> feed_target_shapes = GetFeedTargetShapes( @@ -146,29 +147,47 @@ void SetFakeImageInput(std::vector> *inputs, os << "}\n"; } LOG(INFO) << os.str(); - - int dim1 = feed_target_shapes[0][1]; - int dim2 = feed_target_shapes[0][2]; - int dim3 = feed_target_shapes[0][3]; - - PaddleTensor input; - std::vector shape({FLAGS_batch_size, dim1, dim2, dim3}); - input.shape = shape; - input.dtype = PaddleDType::FLOAT32; - 
- // fill input data, for profile easily, do not use random data here. - size_t size = FLAGS_batch_size * dim1 * dim2 * dim3; - input.data.Resize(size * sizeof(float)); - float *input_data = static_cast(input.data.data()); - for (size_t i = 0; i < size; i++) { - *(input_data + i) = static_cast(i) / size; + if (feed_names) { + PADDLE_ENFORCE_EQ(feed_names->size(), feed_target_shapes.size()); + } + std::vector input_slots(feed_target_shapes.size()); + for (size_t i = 0; i < feed_target_shapes.size(); ++i) { + const auto &feed_shape = feed_target_shapes[i]; + auto &input = input_slots[i]; + std::vector shape({FLAGS_batch_size}); + for (size_t s = 1; s < feed_shape.size(); ++s) { + shape.push_back(static_cast(feed_shape[s])); + } + if (feed_names) { + input.name = (*feed_names)[i]; + } + input.shape = shape; + input.dtype = PaddleDType::FLOAT32; + size_t len = std::accumulate(shape.begin(), shape.end(), 1, + [](int a, int b) { return a * b; }); + input.data.Resize(len * sizeof(float)); + input.lod.assign({{0, static_cast(FLAGS_batch_size)}}); + float *input_data = static_cast(input.data.data()); + // fill input data, for profile easily, do not use random data here. + for (size_t j = 0; j < len; ++j) { + *(input_data + j) = static_cast(j) / len; + } } - - std::vector input_slots; - input_slots.assign({input}); (*inputs).emplace_back(input_slots); } +void GetInputPerBatch(const std::vector> &in, + std::vector> *out, + std::vector *lod, size_t batch_iter, + size_t batch_end) { + lod->clear(); + lod->push_back(0); + for (auto it = in.begin() + batch_iter; it < in.begin() + batch_end; it++) { + out->push_back(*it); + lod->push_back(lod->back() + (*it).size()); // calculate lod + } +} + void TestOneThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, @@ -309,7 +328,10 @@ void CompareNativeAndAnalysis( const std::vector> &inputs) { PrintConfig(config, true); std::vector native_outputs, analysis_outputs; - TestOneThreadPrediction(config, inputs, &native_outputs, false); + const auto *analysis_config = + reinterpret_cast(config); + auto native_config = analysis_config->ToNativeConfig(); + TestOneThreadPrediction(&native_config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); CompareResult(analysis_outputs, native_outputs); } diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index d3bd035c1c..21df6eab81 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -46,22 +46,20 @@ void SetConfig(contrib::AnalysisConfig* config, std::string model_dir, bool use_gpu, bool use_tensorrt, int batch_size) { if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { - config->prog_file = model_dir + "/" + FLAGS_prog_filename; - config->param_file = model_dir + "/" + FLAGS_param_filename; + config->SetModel(model_dir + "/" + FLAGS_prog_filename, + model_dir + "/" + FLAGS_param_filename); } else { - config->model_dir = model_dir; + config->SetModel(model_dir); } if (use_gpu) { - config->use_gpu = true; - config->device = 0; - config->fraction_of_gpu_memory = 0.15; + config->EnableUseGpu(100, 0); if (use_tensorrt) { config->EnableTensorRtEngine(1 << 10, batch_size); config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->TurnOnDebug(); } else { - config->enable_ir_optim = true; + config->SwitchIrOptim(); } } 
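The GetInputPerBatch helper added above is what lets the testers drop their hand-rolled LoD bookkeeping; together with the new three-argument TensorAssignData overload it reduces batch preparation to a few lines. A hedged sketch, assuming the element type is int64_t as in the LAC/NER/MM-DNN testers and that the feed name is hypothetical:

    #include <vector>
    #include "paddle/fluid/inference/api/helper.h"
    #include "paddle/fluid/inference/tests/api/tester_helper.h"

    void PrepareBatch() {
      // Three token sequences loaded from the data file.
      std::vector<std::vector<int64_t>> all = {{1, 2, 3}, {4, 5}, {6, 7, 8, 9}};

      std::vector<std::vector<int64_t>> batch;
      std::vector<size_t> lod;
      // Slice samples [0, 2) into one batch; lod becomes the offsets {0, 3, 5}.
      paddle::inference::GetInputPerBatch(all, &batch, &lod, 0, 2);

      paddle::PaddleTensor tensor;
      tensor.name = "word";
      tensor.dtype = paddle::PaddleDType::INT64;
      // Shape {lod.back(), 1} and the tensor LoD are derived from `lod`.
      paddle::inference::TensorAssignData<int64_t>(&tensor, batch, lod);
    }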
} @@ -77,7 +75,8 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { std::vector outputs; if (use_analysis || use_tensorrt) { - contrib::AnalysisConfig config(true); + contrib::AnalysisConfig config; + config.EnableUseGpu(100, 0); config.pass_builder()->TurnOnDebug(); SetConfig(&config, model_dir, true, use_tensorrt, FLAGS_batch_size); @@ -109,7 +108,8 @@ void compare(std::string model_dir, bool use_tensorrt) { &native_outputs, false); std::vector analysis_outputs; - contrib::AnalysisConfig analysis_config(true); + contrib::AnalysisConfig analysis_config; + analysis_config.EnableUseGpu(50, 0); SetConfig(&analysis_config, model_dir, true, use_tensorrt, FLAGS_batch_size); TestOneThreadPrediction( @@ -154,9 +154,9 @@ TEST(TensorRT_mobilenet, analysis) { TEST(AnalysisPredictor, use_gpu) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; - AnalysisConfig config(true); - config.model_dir = model_dir; - config.fraction_of_gpu_memory = 0.15; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); config.pass_builder()->TurnOnDebug(); std::vector> inputs_all; diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index ab3a30ce6b..29f0f034a2 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -3,14 +3,16 @@ set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") function (inference_download install_dir url filename) message(STATUS "Download inference test stuff from ${url}/${filename}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") + file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}") message(STATUS "finish downloading ${filename}") endfunction() function (inference_download_and_uncompress install_dir url filename) inference_download(${install_dir} ${url} ${filename}) - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") + execute_process( + COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename} + WORKING_DIRECTORY ${install_dir} + ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4a14eb941c..e53a6a562a 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -46,14 +46,14 @@ endif() register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) # warpctc_op needs cudnn 7 above -if (WITH_GPU AND NOT WIN32) +if (WITH_GPU) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() # conv_fusion_op needs cudnn 7 above - if (NOT ${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) op_library(conv_fusion_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") endif() diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 855c4d7067..49e734ce96 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -64,7 +64,7 @@ class ClipByNormKernel : public framework::OpKernel { 
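The test changes above migrate from assigning AnalysisConfig struct fields to the setter-style API. A sketch of the resulting configuration code follows; only calls that appear in the hunks are used, and the umbrella include path is an assumption of the sketch.

```cpp
// Sketch of the setter-based AnalysisConfig API the tests switch to,
// replacing direct struct-field assignment. Include path is assumed.
#include <string>

#include "paddle/fluid/inference/api/paddle_inference_api.h"  // assumed header

void SetupGpuConfig(paddle::contrib::AnalysisConfig *config,
                    const std::string &model_dir) {
  config->SetModel(model_dir);    // was: config->model_dir = model_dir
  config->EnableUseGpu(100, 0);   // was: use_gpu / device / fraction_of_gpu_memory
  config->SwitchIrOptim();        // was: config->enable_ir_optim = true
  config->pass_builder()->TurnOnDebug();
}
```

CompareNativeAndAnalysis now derives the legacy configuration from the same object via config.ToNativeConfig() instead of taking a separately built native config, which is what the first hunk in this file relies on.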
output->mutable_data(context.GetPlace()); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", - in_var->Type().name()); + framework::ToTypeName(in_var->Type())); } PADDLE_ENFORCE_NOT_NULL(input); diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index 6446cab5ec..2e7f3edd55 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -86,8 +86,6 @@ class UnaryLogicalOpInferShape : public framework::InferShapeBase { OpComment comment; PADDLE_ENFORCE(context->HasInput("X"), "Input(X) of %s operator must not be null", comment.type); - auto dim_x = context->GetInputDim("X"); - context->SetOutputDim("Out", context->GetInputDim("X")); context->ShareLoD("X", "Out"); } diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 48800947fd..0360cf5273 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -175,14 +175,13 @@ class WhileGradOp : public framework::OperatorBase { auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name), "Cannot find inside gradient %s", inside_og_name); - if (framework::IsType(og_outside.Type())) { + if (og_outside.IsType()) { auto &outside_tensor = og_outside.Get(); auto &inside_tensor = detail::Ref(og_inside.GetMutable()); inside_tensor.set_lod(outside_tensor.lod()); inside_tensor.ShareDataWith(outside_tensor); - } else if (framework::IsType( - og_outside.Type())) { + } else if (og_outside.IsType()) { auto &outside_array = og_outside.Get(); auto &inside_array = detail::Ref(og_inside.GetMutable()); @@ -256,7 +255,7 @@ class WhileGradOp : public framework::OperatorBase { var->IsType(), "Currently the type of var only can be LoDTensorArray, " "or LoDTensor, but the received var[%s] is %s.", - inside_grad_name, var->Type().name()); + inside_grad_name, framework::ToTypeName(var->Type())); if (var->IsType()) { auto &inside_tensor = var->Get(); diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index 92d394eb3c..f172431e48 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -19,6 +19,10 @@ limitations under the License. */ #include #include "paddle/fluid/platform/cudnn_helper.h" +DECLARE_uint64(conv_workspace_size_limit); +DECLARE_bool(cudnn_exhaustive_search); +DECLARE_int64(cudnn_exhaustive_search_times); + namespace paddle { namespace operators { @@ -45,6 +49,7 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; template class AlgorithmsCache { public: + AlgorithmsCache() : search_times_(0) { hash_.clear(); } // Caches the best algorithm for a given // combination of tensor dimensions & compute data type. 
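The while_op and clip_by_norm changes above switch from the free framework::IsType(var.Type()) check to the member Variable::IsType<T>() and report framework::ToTypeName(var->Type()) in error messages. A sketch of the resulting dispatch idiom; the template arguments are inferred from the error text in the hunk, and the includes are assumptions of this sketch.

```cpp
// Sketch of the Variable type-dispatch idiom used by the while_op gradient.
// Headers and the helper name are assumptions; the calls mirror the hunk.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/variable.h"

namespace fw = paddle::framework;

// Share LoD and data when the outside gradient is a dense LoDTensor; the
// LoDTensorArray branch of the real operator is elided here.
void ShareOutsideGrad(const fw::Variable &og_outside, fw::Variable *og_inside) {
  if (og_outside.IsType<fw::LoDTensor>()) {
    auto &outside_tensor = og_outside.Get<fw::LoDTensor>();
    auto *inside_tensor = og_inside->GetMutable<fw::LoDTensor>();
    inside_tensor->set_lod(outside_tensor.lod());
    inside_tensor->ShareDataWith(outside_tensor);
  }
}
```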
TAlgorithm GetAlgorithm( @@ -54,9 +59,14 @@ class AlgorithmsCache { int algorithmFlags, // can set for different data type std::function gen_func); + TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags, + std::function gen_func); + private: std::unordered_map hash_; std::mutex mutex_; + + int search_times_; }; template @@ -107,5 +117,29 @@ TAlgorithm AlgorithmsCache::GetAlgorithm( return hash_[seed]; } +template +TAlgorithm AlgorithmsCache::GetAlgorithm( + int64_t area, int search_times, int algorithmFlags, + std::function gen_func) { + if (hash_.find(area) != hash_.end()) { + return hash_[area]; + } + if (search_times_ < search_times) { + auto algo = gen_func(); + hash_[area] = algo; + ++search_times_; + return algo; + } + TAlgorithm algo; + int64_t min = static_cast(INT_MAX); + for (const auto& m : hash_) { + if (m.first < min) { + min = m.first; + algo = m.second; + } + } + return algo; +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_fusion_op.cc b/paddle/fluid/operators/conv_fusion_op.cc index 9bdedb10e0..23b8087e78 100644 --- a/paddle/fluid/operators/conv_fusion_op.cc +++ b/paddle/fluid/operators/conv_fusion_op.cc @@ -28,6 +28,8 @@ namespace operators { // x is Input, // z is ResidualData, // bias is Bias +// When `split_channels` is set, y will be splitted into multiple outputs, +// each output has split_channels[i] number of channels. class Conv2DFusionOpMaker : public Conv2DOpMaker { protected: void Apply() override { @@ -36,8 +38,65 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker { "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' " "'relux' , 'tanh', 'band_pass'") .SetDefault("relu"); + AddAttr>( + "split_channels", + "When `split_channels` are set, there will be multiple outputs, the " + "output size is equal to the number of `split_channels`.") + .SetDefault({}); + AddOutput("Outputs", + "This Outputs is used when setting `split_channels`." 
+ "Usually used to fuse conv with same input and same filter size, " + "padding, stride, dilation size.") + .AsDuplicable() + .AsDispensable(); + AddInput("AlgoCache", + "The cache of convolution algorithm, a RAW type variable.") + .AsDispensable(); + AddAttr( + "search_times", + "The number of exhaustive search times for convolution algorithm.") + .SetDefault(-1); } }; + +class Conv2DFusionOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of ConvOp should not be null."); + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = + ctx->Attrs().Get>("dilations"); + + std::vector oshape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + oshape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], strides[i])); + } + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of ConvOp should not be null."); + ctx->SetOutputDim("Output", framework::make_ddim(oshape)); + std::vector channels = + ctx->Attrs().Get>("split_channels"); + if (channels.size()) { + PADDLE_ENFORCE(ctx->HasOutputs("Outputs"), + "Output(Outputs) of ConvOp should not be null."); + std::vector oshapes; + oshapes.reserve(channels.size()); + for (size_t i = 0; i < channels.size(); ++i) { + oshapes.push_back({oshape[0], channels[i], oshape[2], oshape[3]}); + } + ctx->SetOutputsDim("Outputs", oshapes); + } + } +}; + // TODO(qingqing): add gradient operator for conv2d_fusion } // namespace operators @@ -45,4 +104,5 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(conv2d_fusion, ops::ConvOp, ops::Conv2DFusionOpMaker, - ops::ConvOpInferVarType, paddle::framework::EmptyGradOpMaker); + ops::Conv2DFusionOpInferShape, ops::ConvOpInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 3235ad52b9..d8b997cca6 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -16,13 +16,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/cudnn_helper.h" -DECLARE_uint64(conv_workspace_size_limit); -DECLARE_bool(cudnn_exhaustive_search); +DEFINE_int64(cudnn_exhaustive_search_times, -1, + "Exhaustive search times for cuDNN convolution, " + "defalut is 1, only search once."); namespace paddle { namespace operators { -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 using Tensor = framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; @@ -117,41 +118,60 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); VLOG(3) << "cuDNN forward algo " << algo; } else { + auto search_func = [&]() { + int returned_algo_count; + std::array + fwd_perf_stat; + auto cudnn_find_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, + kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); + VLOG(3) << "Perf result: (algo: stat, time, memory)"; + for (int i = 0; i < returned_algo_count; ++i) { + const auto& stat = fwd_perf_stat[i]; + VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time << " " + << stat.memory; + } + return fwd_perf_stat[0].algo; + }; AlgorithmsCache* algo_cache = nullptr; - if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) { + int search_times = ctx.Attr("search_times"); + search_times = std::max( + static_cast(FLAGS_cudnn_exhaustive_search_times), search_times); + if (search_times > 0) { + // The searched algo will be cached by `search_times` times for + // different input dimension. For other dimensions, select the algo + // of closest area. + auto var_name = ctx.Inputs("AlgoCache")[0]; algo_cache = ctx.scope() - .FindVar(kCUDNNFwdAlgoCache) + .FindVar(var_name) ->GetMutable>(); + algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0, + search_func); } else { - algo_cache = - const_cast(ctx.scope()) - .Var(kCUDNNFwdAlgoCache) - ->GetMutable>(); + // Cache searched algo in Var(kCUDNNFwdAlgoCache). + // all conv ops use the same kCUDNNFwdAlgoCache variable. 
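The exhaustive-search path now keys the algorithm cache on the input spatial area (x_dims[2] * x_dims[3]) and caps how many searches may run. A simplified sketch of that policy follows, with int standing in for cudnnConvolutionFwdAlgo_t; once the budget is spent, the code falls back to the algorithm cached under the smallest area key.

```cpp
// Minimal sketch of the area-keyed caching policy added to AlgorithmsCache.
// Types are simplified; the control flow mirrors the new GetAlgorithm overload.
#include <climits>
#include <cstdint>
#include <functional>
#include <iostream>
#include <unordered_map>

class AreaKeyedCache {
 public:
  int GetAlgorithm(int64_t area, int search_times,
                   const std::function<int()> &gen_func) {
    auto it = hash_.find(area);
    if (it != hash_.end()) return it->second;  // hit: reuse cached algo
    if (search_times_ < search_times) {        // budget left: search and cache
      int algo = gen_func();
      hash_[area] = algo;
      ++search_times_;
      return algo;
    }
    // budget exhausted: reuse the algo stored under the smallest cached area
    int64_t min_area = INT_MAX;
    int algo = 0;
    for (const auto &kv : hash_) {
      if (kv.first < min_area) {
        min_area = kv.first;
        algo = kv.second;
      }
    }
    return algo;
  }

 private:
  std::unordered_map<int64_t, int> hash_;
  int search_times_ = 0;
};

int main() {
  AreaKeyedCache cache;
  auto search = [] { return 42; };  // stands in for cudnnFindConvolution...Ex
  std::cout << cache.GetAlgorithm(32 * 32, /*search_times=*/1, search) << "\n";
  std::cout << cache.GetAlgorithm(64 * 64, 1, search) << "\n";  // reuses 32x32 entry
  return 0;
}
```

With search_times <= 0 (the default, since FLAGS_cudnn_exhaustive_search_times defaults to -1 and the attribute defaults to -1), the kernel keeps the previous behaviour of caching per full dimension key in the scope-level kCUDNNFwdAlgoCache variable.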
+ if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) { + algo_cache = + ctx.scope() + .FindVar(kCUDNNFwdAlgoCache) + ->GetMutable>(); + } else { + // TODO(qingqing) remove const_cast + algo_cache = + const_cast(ctx.scope().parent()) + ->Var(kCUDNNFwdAlgoCache) + ->GetMutable>(); + } + algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings, + dilations, 0, search_func); } - algo = algo_cache->GetAlgorithm( - x_dims, f_dims, strides, paddings, dilations, 0, [&]() { - int returned_algo_count; - std::array - fwd_perf_stat; - auto cudnn_find_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE( - platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - handle, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, cudnn_output_desc, - output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - fwd_perf_stat.data(), cudnn_workspace, - workspace_size_limit)); - }; - workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); - VLOG(3) << "Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = fwd_perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - return fwd_perf_stat[0].algo; - }); VLOG(3) << "choose algo " << algo; } @@ -161,9 +181,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - if ((activation == "identity") && - (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) && - (!residual)) { + if ((activation == "identity") && (!residual)) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. // But test in some case, the speed is slower, change to use @@ -197,6 +215,27 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { }; workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } + std::vector channels = ctx.Attr>("split_channels"); + if (channels.size()) { + auto outs = ctx.MultiOutput("Outputs"); + if (x_dims[0] == 1) { + // share data with Output + framework::Tensor t; + t.ShareDataWith(*output); + auto y_dims = output->dims(); + t.Resize({y_dims[1], y_dims[2], y_dims[3]}); + int s = 0; + for (size_t i = 0; i < channels.size(); ++i) { + int e = s + channels[i]; + outs[i]->ShareDataWith(t.Slice(s, e)); + outs[i]->Resize({x_dims[0], channels[i], y_dims[2], y_dims[3]}); + s = e; + } + } else { + // TODO(qingiqng): do copy when batch size large than 1 + PADDLE_THROW("Batch size greater than 1 is Unsupported"); + } + } } }; #endif @@ -204,7 +243,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel, ops::CUDNNConvFusionOpKernel); diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 8c116c4abf..03d9d466c3 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
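When split_channels is set and the batch size is 1, the kernel reshapes the fused output to C x H x W and hands each consumer a channel slice that shares the same memory. The slice bounds are just a running sum over the attribute; a trivial sketch with hypothetical channel counts:

```cpp
// Sketch of the [s, e) channel ranges the Slice calls in the kernel receive.
// split_channels values here are made up for illustration.
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> split_channels = {4, 8, 4};  // hypothetical attribute value
  int s = 0;
  for (size_t i = 0; i < split_channels.size(); ++i) {
    int e = s + split_channels[i];
    // outs[i] shares output.Slice(s, e) over the channel dimension of the
    // C x H x W view, then is resized back to 1 x channels[i] x H x W.
    std::cout << "output " << i << ": channels [" << s << ", " << e << ")\n";
    s = e;
  }
  return 0;
}
```

For batch sizes greater than 1 the kernel currently throws, as the TODO in the hunk above notes.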
*/ +#include #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/conv_op.h" @@ -68,13 +69,22 @@ inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format, } } -template +template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + bool is_INT8 = + std::is_same::value || std::is_same::value; + if (!is_INT8) { + ComputeFP32(ctx); + } else { + ComputeINT8(ctx); + } + } + void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr("is_test"); auto& dev_ctx = @@ -274,6 +284,271 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory_p)); } + void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { + const bool is_test = ctx.Attr("is_test"); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); + + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, + "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); + PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, + "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"); + if (bias) { + PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && + bias->format() != memory::format::format_undef, + "Wrong layout/format set for Bias tensor"); + PADDLE_ENFORCE(bias->dims().size() == 1, + "Bias must only have 1 dimension, i.e. X"); + } + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + + bool fuse_relu = ctx.Attr("fuse_relu"); + + bool force_fp32_output = ctx.Attr("force_fp32_output"); + + bool is_conv3d = strides.size() == 3U; + // TODO(tpatejko): add support for dilation + PADDLE_ENFORCE( + is_conv3d + ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && + dilations[2] == 1 + : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + "dilation in convolution is not implemented yet"); + + PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently"); + + const T* input_data = input->data(); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector weights_tz = + paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + GetWeightsTz(weights_tz, g, is_conv3d); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + mkldnn::memory::data_type src_dt = + paddle::framework::ToMKLDNNDataType(input->type()); + auto dst_dt = fuse_relu ? 
paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType) + : paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); + + if (force_fp32_output) { + dst_dt = paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); + } + + // Get unique name for storing MKLDNN primitives + std::string key; + key.reserve(MaxKeyLength); + platform::ConvMKLDNNHandler::AppendKey( + &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, + input->format(), dst_dt, ctx.op().Output("Output")); + const std::string key_conv_pd = key + "@conv_pd"; + + std::shared_ptr conv_p = nullptr; + std::shared_ptr src_memory_p = nullptr; + std::shared_ptr user_src_memory_p = nullptr; + std::shared_ptr dst_memory_p = nullptr; + std::vector pipeline; + std::shared_ptr conv_pd = + nullptr; + std::shared_ptr handler = nullptr; + + auto prim_key = key + "@conv_p"; + auto dst_key = key + "@dst_mem_p"; + auto src_key = key + "@src_mem_p"; + auto user_src_key = key + "@user_src_mem_p"; + auto src_reorder_key = key + "@src_mem_preorder_p"; + conv_p = std::static_pointer_cast( + dev_ctx.GetBlob(prim_key)); + if (conv_p == nullptr || !is_test) { + const K* filter_data = filter->data(); + auto scale_in_data = ctx.Attr("Scale_in"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + auto scale_out_data = + force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + + bool is_multi_channel = scale_weights_data.size() > 1; + + int count = is_multi_channel ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] + : (weights_tz)[0]) + : 1; + std::vector output_shift_scale(count); +#pragma omp parallel for if (count > 1) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + output_shift_scale[i] = + scale_out_data; // weights data will contain 0 + // in some models, then weights + // scale couldn't be calculated + else + output_shift_scale[i] = + scale_out_data / (scale_in_data * scale_weights_data[i]); + } + + auto user_src_md = + platform::MKLDNNMemDesc({src_tz}, src_dt, input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), + ((g) == 1) ? 
mkldnn::memory::format::oihw + : mkldnn::memory::format::goihw); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + + std::vector bias_tz; + + auto src_md = + platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::s8, chosen_memory_format); + auto dst_md = + platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); + // create a conv primitive descriptor and save it for usage in backward + if (bias) { + bias_tz = paddle::framework::vectorize2int(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, + memory::format::x); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + fuse_relu, output_shift_scale, is_test); + } else { + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, + paddings, mkldnn_engine, fuse_relu, + output_shift_scale, is_test); + } + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, + mkldnn_engine, key)); + + // create mkldnn memory from input tensors (data/weights) + user_src_memory_p = + handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler->AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); + + // create reorder primitive if the input format is not the preferred one + src_memory_p = + handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + + std::shared_ptr weights_memory_p; + int mask_reorder = + is_multi_channel ? ((g != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; + weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test, true, scale_weights_data, + mask_reorder); + + if (!force_fp32_output) { + if (fuse_relu) { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } else { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } + } else { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } + + // create convolution op primitive + auto scale_bias_key = key + "@scale_bias"; + if (bias) { + const float* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + auto user_bias_memory_p = handler->AcquireBiasMemory( + user_bias_md, to_void_cast(bias_data)); + std::shared_ptr bias_memory_p; + int mask_reorder = is_multi_channel ? 1 << 0 : 1; + int count = + is_multi_channel + ? (g > 1 ? 
(weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector scale_bias_data(count); +#pragma omp parallel for if (count > 1) + for (int i = 0; i < count; i++) { + scale_bias_data[i] = scale_in_data * scale_weights_data[i]; + } + bias_memory_p = handler->AcquireBiasMemoryFromPrimitive( + user_bias_memory_p, pipeline, is_test, true, scale_bias_data, + mask_reorder); + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); + } + + // push primitive to stream and wait until it's executed + pipeline.push_back(*conv_p); + } else { + auto src_memory_reorder_p = std::static_pointer_cast( + dev_ctx.GetBlob(src_reorder_key)); + src_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(src_key)); + if (src_memory_reorder_p) { + user_src_memory_p = std::static_pointer_cast( + dev_ctx.GetBlob(user_src_key)); + user_src_memory_p->set_data_handle(to_void_cast(input_data)); + } else if (src_memory_p) { + src_memory_p->set_data_handle(to_void_cast(input_data)); + } + + dst_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); + conv_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); + if (conv_pd) { + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, + mkldnn_engine, key)); + } + if (!force_fp32_output) { + if (fuse_relu) { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } else { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } + } else { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } + if (src_memory_reorder_p) { + pipeline.push_back(*src_memory_reorder_p); + } + pipeline.push_back(*conv_p); + } + // push primitive to stream and wait until it's executed + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); + } private: mkldnn::primitive_attr CreatePostOps(bool fuse_relu, @@ -301,6 +576,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { return conv_attr; } + mkldnn::primitive_attr CreatePostOps( + bool fuse_relu, const std::vector output_shift_scale) const { + mkldnn::primitive_attr conv_attr; + mkldnn::post_ops post_operations; + int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; + conv_attr.set_output_scales(mask, output_shift_scale); + if (fuse_relu) { + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 1.0f; // beta + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + } + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector& strides, @@ -325,6 +617,33 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { p_conv_pd); } + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, const bool fuse_relu, + const std::vector output_shift_scale, + bool is_test) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto propagation = is_test ? 
mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto conv_desc = mkldnn::convolution_forward::desc( + propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims, + padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, output_shift_scale); + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); + + return std::unique_ptr( + p_conv_pd); + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& bias, const memory::desc& dst, @@ -349,6 +668,34 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { return std::unique_ptr( p_conv_pd); } + + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& bias, const memory::desc& dst, + const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, const bool fuse_relu, + const std::vector output_shift_scale, + bool is_test) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto propagation = is_test ? mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto conv_desc = mkldnn::convolution_forward::desc( + propagation, mkldnn::convolution_direct, src, weights, bias, dst, + stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, output_shift_scale); + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); + + return std::unique_ptr( + p_conv_pd); + } }; template @@ -555,7 +902,17 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, U8, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, S8, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, @@ -565,7 +922,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 8e0d282495..c8b33b8932 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( #endif auto input_data_type = ctx.Input("Input")->type(); - auto filter_data_type = ctx.Input("Filter")->type(); - PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, - "input and filter data type should be consistent"); - + if (input_data_type != framework::proto::VarType::INT8 && + input_data_type != framework::proto::VarType::UINT8) { + auto filter_data_type = ctx.Input("Filter")->type(); + PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, + "input and filter data type should be consistent"); + } if (input_data_type == framework::proto::VarType::FP16) { 
PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN, "float16 can only be used when CUDNN is used"); @@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() { "whenever convolution output is as an input to residual " "connection.") .SetDefault(false); + AddAttr("Scale_in", + "Scale_in to be used for int8 input data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Scale_out", + "Scale_out to be used for int8 output data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Scale_in_eltwise", + "Scale_in_eltwise to be used for int8 eltwise input data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr>("Scale_weights", + "Scale_weights to be used for int8 weights data." + "Only used with MKL-DNN INT8.") + .SetDefault({1.0f}); + AddAttr("force_fp32_output", + "(bool, default false) Force INT8 kernel output FP32, only " + "used in MKL-DNN INT8") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") .SetDefault("AnyLayout"); + AddAttr("force_fp32_output", + "(bool, default false) Only used in mkldnn INT8 kernel") + .SetDefault(false); // TODO(dzhwinter): need to registered layout transform function AddAttr("workspace_size_MB", "Only used in cudnn kernel. workspace size for cudnn, in MB, " diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 2519f5e7ac..eaa288edc5 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" @@ -30,6 +29,7 @@ namespace operators { using Tensor = framework::Tensor; constexpr int kConvMKLDNNFP32 = 1; constexpr int kConvMKLDNNINT8 = 2; +constexpr int MaxKeyLength = 256; // Base convolution operator definations for other conv // like operators to reuse the implementation. @@ -158,10 +158,7 @@ class GemmConvKernel : public framework::OpKernel { // to call the matrix multiplication interface. Tensor col_matrix; if (is_expand) { - auto tmp_allocation_ptr = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - framework::product(col_shape) * sizeof(T)); - col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); + col = context.AllocateTmpTensor(col_shape, dev_ctx); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } @@ -293,10 +290,7 @@ class GemmConvGradKernel : public framework::OpKernel { // to call the matrix multiplication interface. 
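The Scale_in / Scale_out / Scale_weights attributes added to Conv2DOpMaker above feed the requantization step of the MKL-DNN INT8 path (ComputeINT8 earlier in this patch): the convolution accumulator is rescaled per output channel by Scale_out / (Scale_in * Scale_weights[oc]), and the bias is pre-scaled by Scale_in * Scale_weights[oc]. A standalone sketch of that arithmetic, with illustrative scale values; the zero-weight-scale fallback mirrors the comment in the kernel.

```cpp
// Sketch of the output_shift_scale computation used by the INT8 kernel.
// Scale values below are illustrative only.
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> OutputShiftScale(float scale_in, float scale_out,
                                    const std::vector<float> &scale_weights,
                                    bool force_fp32_output) {
  if (force_fp32_output) scale_out = 1.0f;  // dequantize straight to FP32
  std::vector<float> shift(scale_weights.size());
  for (size_t i = 0; i < scale_weights.size(); ++i) {
    // a zero weight scale means the channel's weights are all zero; fall back
    // to scale_out so the rescaling stays well defined
    shift[i] = (scale_weights[i] == 0.0f)
                   ? scale_out
                   : scale_out / (scale_in * scale_weights[i]);
  }
  return shift;
}

int main() {
  auto shift = OutputShiftScale(/*scale_in=*/127.0f, /*scale_out=*/63.5f,
                                {100.0f, 0.0f}, /*force_fp32_output=*/false);
  for (float s : shift) std::cout << s << ' ';  // prints: 0.005 63.5
  std::cout << '\n';
  return 0;
}
```

When force_fp32_output is set, the kernel also switches the destination data type back to FP32, which is why Scale_out collapses to 1.0 in that branch.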
Tensor col_matrix; if (is_expand) { - auto tmp_allocation_ptr = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - framework::product(col_shape) * sizeof(T)); - col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); + col = context.AllocateTmpTensor(col_shape, dev_ctx); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 2d7d33bd4f..cfc2cac7be 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -68,7 +68,6 @@ void CropFunction(const framework::ExecutionContext& context) { } out->mutable_data(out_dims, context.GetPlace()); auto x_stride = framework::stride(x->dims()); - auto out_stride = framework::stride(out->dims()); auto offsets = GetOffsets(context); int64_t offset = 0; for (size_t i = 0; i < offsets.size(); ++i) { diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index f2ba75485c..1bf41ed948 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { @@ -22,239 +22,6 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -struct CudnnRNNCache { - CudnnRNNCache() { - x_desc_ = NULL; - y_desc_ = NULL; - dx_desc_ = NULL; - dy_desc_ = NULL; - } - ~CudnnRNNCache() { release(); } - - cudnnRNNDescriptor_t rnn_desc_; - cudnnTensorDescriptor_t *x_desc_; - cudnnTensorDescriptor_t *y_desc_; - cudnnTensorDescriptor_t *dx_desc_; - cudnnTensorDescriptor_t *dy_desc_; - - cudnnTensorDescriptor_t hx_desc_; - cudnnTensorDescriptor_t cx_desc_; - cudnnTensorDescriptor_t hy_desc_; - cudnnTensorDescriptor_t cy_desc_; - - cudnnTensorDescriptor_t dhx_desc_; - cudnnTensorDescriptor_t dcx_desc_; - cudnnTensorDescriptor_t dhy_desc_; - cudnnTensorDescriptor_t dcy_desc_; - - cudnnTensorDescriptor_t output_x_desc_; - cudnnTensorDescriptor_t output_y_desc_; - - cudnnDropoutDescriptor_t dropout_desc_; - - size_t weights_size_; - cudnnFilterDescriptor_t w_desc_; - cudnnFilterDescriptor_t dw_desc_; - - size_t workspace_size_; - size_t reserve_size_; - Tensor reserve_data_; - Tensor workspace_data_; - - Tensor dropout_state_; - - size_t max_length_; - - float dropout_prob_; - bool is_bidirec_; - - int batch_size_; - int input_size_; - int hidden_size_; - int num_layers_; - int seed_; - - void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx, - size_t max_len, int batch_size, int input_size, int hidden_size, - int num_layers, float dropout_prob, bool is_bidirec, int seed, - int weight_numel) { - max_length_ = max_len; - batch_size_ = batch_size; - input_size_ = input_size; - hidden_size_ = hidden_size; - num_layers_ = num_layers; - dropout_prob_ = dropout_prob; - is_bidirec_ = is_bidirec; - seed_ = seed; - - x_desc_ = new cudnnTensorDescriptor_t[max_length_]; - y_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; - int dim_a[3]; - int stride_a[3]; - - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - 
platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); - dim_a[0] = batch_size_; - dim_a[1] = input_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - dim_a[0] = batch_size_; - dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - } - - dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); - dim_a[1] = batch_size_; - dim_a[2] = hidden_size_; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - CUDNN_ENFORCE( - platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); - - size_t state_size; - CUDNN_ENFORCE( - platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); - dropout_state_.Resize({static_cast(state_size)})); - auto *dropout_state_data = - dropout_state_.mutable_data(ctx.GetPlace()); - CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( - dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, - seed_)); - - CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); - -#if CUDNN_VERSION >= 6000 - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( - handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, - 
CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); -#else - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_DATA_FLOAT)); -#endif - - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( - handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); - - PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, - "cudnn lstm weight size should be SAME"); - int dim_w[3]; - dim_w[0] = weights_size_ / sizeof(float); - dim_w[1] = 1; - dim_w[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( - handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( - handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); - - reserve_data_.Resize({static_cast(reserve_size_)}); - reserve_data_.mutable_data(ctx.GetPlace()); - - workspace_data_.Resize({static_cast(workspace_size_)}); - workspace_data_.mutable_data(ctx.GetPlace()); - } - - void release() { - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); - } - - delete[] x_desc_; - delete[] y_desc_; - delete[] dx_desc_; - delete[] dy_desc_; - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); - - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); - } -}; - template class CudnnLSTMGPUKernel : public framework::OpKernel { public: @@ -315,9 +82,9 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { auto input_w_numel = w->numel(); auto batch_size = x->dims()[1]; - cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, - hidden_size, num_layers, dropout_prob, is_bidirec, - seed, input_w_numel); + cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size, + input_size, hidden_size, num_layers, dropout_prob, + is_bidirec, seed, input_w_numel); } auto run_seq_len = x->dims()[0]; @@ 
-380,7 +147,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { ->GetMutable(); auto input_dims = input->dims(); - auto weight_dims = weight->dims(); auto init_h_dims = init_h->dims(); auto init_c_dims = init_c->dims(); in_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h new file mode 100644 index 0000000000..7f18b83927 --- /dev/null +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -0,0 +1,255 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +struct CudnnRNNCache { + CudnnRNNCache() { + x_desc_ = NULL; + y_desc_ = NULL; + dx_desc_ = NULL; + dy_desc_ = NULL; + } + ~CudnnRNNCache() { release(); } + + cudnnRNNDescriptor_t rnn_desc_; + cudnnTensorDescriptor_t *x_desc_; + cudnnTensorDescriptor_t *y_desc_; + cudnnTensorDescriptor_t *dx_desc_; + cudnnTensorDescriptor_t *dy_desc_; + + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnTensorDescriptor_t dhx_desc_; + cudnnTensorDescriptor_t dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_; + cudnnTensorDescriptor_t dcy_desc_; + + cudnnTensorDescriptor_t output_x_desc_; + cudnnTensorDescriptor_t output_y_desc_; + + cudnnDropoutDescriptor_t dropout_desc_; + + size_t weights_size_; + cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t dw_desc_; + + size_t workspace_size_; + size_t reserve_size_; + framework::Tensor reserve_data_; + framework::Tensor workspace_data_; + + framework::Tensor dropout_state_; + + size_t max_length_; + + float dropout_prob_; + bool is_bidirec_; + + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + int seed_; + + void init(cudnnHandle_t handle, const platform::Place &place, size_t max_len, + int batch_size, int input_size, int hidden_size, int num_layers, + float dropout_prob, bool is_bidirec, int seed, int weight_numel) { + max_length_ = max_len; + batch_size_ = batch_size; + input_size_ = input_size; + hidden_size_ = hidden_size; + num_layers_ = num_layers; + dropout_prob_ = dropout_prob; + is_bidirec_ = is_bidirec; + seed_ = seed; + + x_desc_ = new cudnnTensorDescriptor_t[max_length_]; + y_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; + int dim_a[3]; + int stride_a[3]; + + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); + dim_a[0] = batch_size_; + dim_a[1] = 
input_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + dim_a[0] = batch_size_; + dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + } + + dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); + dim_a[1] = batch_size_; + dim_a[2] = hidden_size_; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); + + size_t state_size; + CUDNN_ENFORCE( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); + dropout_state_.Resize({static_cast(state_size)})); + auto *dropout_state_data = dropout_state_.mutable_data(place); + CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( + dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, + seed_)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + +#if CUDNN_VERSION >= 6000 + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); +#else + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, + is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_DATA_FLOAT)); +#endif + + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); + + PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, + "cudnn lstm weight size should be SAME"); + int dim_w[3]; + dim_w[0] = weights_size_ / sizeof(float); + dim_w[1] = 1; + dim_w[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); + + reserve_data_.Resize({static_cast(reserve_size_)}); + reserve_data_.mutable_data(place); + + workspace_data_.Resize({static_cast(workspace_size_)}); + workspace_data_.mutable_data(place); + } + + void release() { + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); + } + + delete[] x_desc_; + delete[] y_desc_; + delete[] dx_desc_; + delete[] dy_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h index 999fdcff90..7c0fda4169 100644 --- a/paddle/fluid/operators/cum_op.h +++ b/paddle/fluid/operators/cum_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h index a800d5df0a..8660bc219c 100644 --- a/paddle/fluid/operators/detail/safe_ref.h +++ b/paddle/fluid/operators/detail/safe_ref.h @@ -25,7 +25,7 @@ namespace detail { */ template inline T& Ref(T* ptr, ARGS&&... 
args) { - PADDLE_ENFORCE(ptr != nullptr, args...); + PADDLE_ENFORCE(ptr != nullptr, ::paddle::string::Sprintf(args...)); return *ptr; } diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index 0b7c470fe7..94419d1f9a 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -27,8 +27,8 @@ struct StridedMemcpyFunctor; template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim<0> src_stride, framework::Dim<0> dst_dim, - framework::Dim<0> dst_stride, T* dst) const { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { auto& cpu_place = boost::get(place); @@ -50,18 +50,18 @@ struct StridedMemcpyFunctor { template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim<1> src_stride, framework::Dim<1> dst_dim, - framework::Dim<1> dst_stride, T* dst) const { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { auto& cpu_place = boost::get(place); - memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head); + memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { #ifdef PADDLE_WITH_CUDA auto& gpu_place = boost::get(place); auto& cuda_ctx = reinterpret_cast(dev_ctx); - memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, + memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0], cuda_ctx.stream()); #else PADDLE_THROW("Paddle is not compiled with GPU"); @@ -73,19 +73,19 @@ struct StridedMemcpyFunctor { template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim src_stride, framework::Dim dst_dim, - framework::Dim dst_stride, T* dst) const { - for (int64_t i = 0; i < dst_dim.head; ++i) { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { + for (int64_t i = 0; i < dst_dim[0]; ++i) { StridedMemcpyFunctor func; - func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst); - src += src_stride.head; - dst += dst_stride.head; + func(dev_ctx, src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst); + src += src_stride[0]; + dst += dst_stride[0]; } } }; template -struct StridedCopyDimVisitor : public boost::static_visitor { +struct StridedCopyDimVisitor { StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& src_stride, const framework::DDim& dst_stride, T* dst) @@ -95,13 +95,11 @@ struct StridedCopyDimVisitor : public boost::static_visitor { dst_stride_(dst_stride), dst_(dst) {} - template - void operator()(Dim dst_dim) const { - Dim src_stride = boost::get(src_stride_); - Dim dst_stride = boost::get(dst_stride_); - constexpr int dim = Dim::dimensions; - StridedMemcpyFunctor functor; - functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_); + template + void operator()(const framework::Dim& dst_dim) const { + StridedMemcpyFunctor functor; + functor(dev_ctx_, src_, src_stride_.Get(), dst_dim.Get(), dst_stride_.Get(), + dst_); } const platform::DeviceContext& dev_ctx_; diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index 
acd5993154..6337a4837a 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -148,7 +148,7 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { // blockx is multiple of 32. int blockx = std::min( static_cast(((feature_width * num_priors + 31) >> 5) << 5), - 512L); + static_cast(512L)); int gridx = (feature_width * num_priors + blockx - 1) / blockx; dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index fddd688401..a652d4d957 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -64,8 +64,6 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null"); auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); - auto gt_classes_dims = ctx->GetInputDim("GtClasses"); - auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); auto im_info_dims = ctx->GetInputDim("ImInfo"); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 2c46803fd0..06e48f1262 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -53,12 +53,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Variances"), "Input(Variances) shouldn't be null."); - auto scores_dims = ctx->GetInputDim("Scores"); - auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - auto anchors_dims = ctx->GetInputDim("Anchors"); - auto variances_dims = ctx->GetInputDim("Variances"); - ctx->SetOutputDim("RpnRois", {-1, 4}); ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); } diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index dc6c3d5a66..0b8053e8d0 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -58,7 +58,6 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { auto anchor_dims = ctx->GetInputDim("Anchor"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto im_info_dims = ctx->GetInputDim("ImInfo"); PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, "The rank of Input(Anchor) must be 2."); diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index eab4297c73..8a25d57e61 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -7,56 +7,52 @@ if(WITH_GRPC) else() set(cc_generic_services "true") endif() -configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) +configure_file(send_recv.proto.in ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto @ONLY) +# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - if(WITH_GRPC) - grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - 
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc - PROTO send_recv.proto + set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc) + grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc + variable_response.cc + collective_client.cc collective_server.cc + ${GRPC_SRCS} + PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto DEPS lod_tensor selected_rows_functor memory) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set(RPC_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) - cc_test(grpc_serde_test SRCS grpc_serde_test.cc - DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL) - - cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) - - cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) - - if(WITH_GPU) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor - selected_rows_functor scope math_function SERIAL) - endif() + cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc + DEPS ${RPC_DEPS} scope profiler math_function SERIAL) - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) else() set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc - brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc collective_client.cc collective_server.cc - PROTO send_recv.proto + set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc/server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) + brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc + variable_response.cc + collective_client.cc collective_server.cc + ${BRPC_SRCS} + PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto DEPS lod_tensor selected_rows memory) - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) - - set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor - proto_desc lookup_sparse_table_op snappystream snappy zlib) - - cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS ${brpc_test_depends} SERIAL) + set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib) + cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc + DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL) +endif() - cc_test(brpc_serde_test SRCS brpc_serde_test.cc - DEPS ${brpc_test_depends} SERIAL) - if(WITH_GPU) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS ${brpc_test_depends} selected_rows_functor scope math_function SERIAL) - endif() +cc_test(rpc_server_test SRCS 
rpc_server_test.cc + DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL) +cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) +cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) +if(WITH_GPU) + cc_test(collective_server_test SRCS collective_server_test.cc + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + selected_rows_functor scope math_function SERIAL) endif() diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc similarity index 99% rename from paddle/fluid/operators/distributed/brpc_client.cc rename to paddle/fluid/operators/distributed/brpc/brpc_client.cc index 62e32977b8..87bdb83503 100644 --- a/paddle/fluid/operators/distributed/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/distributed/brpc_client.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h similarity index 97% rename from paddle/fluid/operators/distributed/brpc_client.h rename to paddle/fluid/operators/distributed/brpc/brpc_client.h index 80cc81bff3..2066ade8a5 100644 --- a/paddle/fluid/operators/distributed/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -31,10 +31,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc similarity index 97% rename from paddle/fluid/operators/distributed/brpc_rdma_pool.cc rename to paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc index e1be5673df..d5c614001e 100644 --- a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc @@ -14,7 +14,7 @@ #ifdef PADDLE_WITH_BRPC_RDMA -#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" #include "brpc/channel.h" #include "brpc/rdma/rdma_helper.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h similarity index 100% rename from paddle/fluid/operators/distributed/brpc_rdma_pool.h rename to paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc similarity index 96% rename from paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc rename to paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc index e4604db3a3..49e048f07a 100644 --- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc @@ -20,10 +20,10 @@ limitations under the License. */ #include // NOLINT #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h similarity index 96% rename from paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h rename to paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h index ffaf442224..a5bdc331eb 100644 --- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc similarity index 97% rename from paddle/fluid/operators/distributed/brpc_serde_test.cc rename to paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc index 2a2dc72150..b902d3db48 100644 --- a/paddle/fluid/operators/distributed/brpc_serde_test.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc @@ -22,8 +22,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc similarity index 98% rename from paddle/fluid/operators/distributed/brpc_server.cc rename to paddle/fluid/operators/distributed/brpc/brpc_server.cc index 78d41aeac5..cbe0bd09c7 100644 --- a/paddle/fluid/operators/distributed/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/request_handler.h" namespace sendrecv { diff --git a/paddle/fluid/operators/distributed/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h similarity index 95% rename from paddle/fluid/operators/distributed/brpc_server.h rename to paddle/fluid/operators/distributed/brpc/brpc_server.h index 85a7ad0dfe..78bbe5adc0 100644 --- a/paddle/fluid/operators/distributed/brpc_server.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.h @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include #include "brpc/server.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc similarity index 96% rename from paddle/fluid/operators/distributed/brpc_variable_response.cc rename to paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc index 75306d7233..eb78917ad2 100644 --- a/paddle/fluid/operators/distributed/brpc_variable_response.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc @@ -13,7 +13,7 @@ // limitations under the License. // -#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h similarity index 97% rename from paddle/fluid/operators/distributed/brpc_variable_response.h rename to paddle/fluid/operators/distributed/brpc/brpc_variable_response.h index b0b91a42a0..6282f08a72 100644 --- a/paddle/fluid/operators/distributed/brpc_variable_response.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h index 53b03c531a..6a3a450a1f 100644 --- a/paddle/fluid/operators/distributed/collective_client.h +++ b/paddle/fluid/operators/distributed/collective_client.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler.h" DECLARE_int32(rpc_deadline); diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h index a23dc18b4d..03c688a78e 100644 --- a/paddle/fluid/operators/distributed/collective_server.h +++ b/paddle/fluid/operators/distributed/collective_server.h @@ -23,7 +23,7 @@ limitations under the License. */ #include "gflags/gflags.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/rpc_server.h" diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc index 0a9c69e393..46c761000c 100644 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -21,9 +21,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/collective_client.h" #include "paddle/fluid/operators/distributed/collective_server.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/math/math_function.h" @@ -52,12 +52,12 @@ std::unique_ptr GenerateVars(platform::Place place) { framework::Scope* scope = new framework::Scope(); framework::Variable* var = scope->Var("var1"); auto* slr = var->GetMutable(); - slr->set_height(1000); + slr->set_height(20000); auto* tensor = slr->mutable_value(); auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({3, 5})); + tensor->Resize(framework::make_ddim({20000, 1024})); tensor->mutable_data(place); paddle::operators::math::set_constant(ctx, tensor, 32.7); @@ -83,6 +83,7 @@ void Gather(const std::vector& vars, } TEST(PREFETCH, GPU) { + setenv("FLAGS_max_body_size", "2147483647", 1); platform::CUDAPlace place; platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/distributed/distributed.h similarity index 80% rename from paddle/fluid/operators/detail/macros.h rename to paddle/fluid/operators/distributed/distributed.h index 6f4a15caa5..3a9f922598 100644 --- a/paddle/fluid/operators/detail/macros.h +++ b/paddle/fluid/operators/distributed/distributed.h @@ -18,15 +18,15 @@ #ifdef PADDLE_WITH_GRPC -#include "paddle/fluid/operators/distributed/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc_server.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" #define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer #define RPCCLIENT_T paddle::operators::distributed::GRPCClient #else // PADDLE_WITH_GRPC -#include "paddle/fluid/operators/distributed/brpc_client.h" -#include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" #define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer #define RPCCLIENT_T paddle::operators::distributed::BRPCClient diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/operators/distributed/distributed_pb.h new file mode 100644 index 0000000000..f1c662be9a --- /dev/null +++ b/paddle/fluid/operators/distributed/distributed_pb.h @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#ifdef PADDLE_WITH_DISTRIBUTE + +#ifdef PADDLE_WITH_GRPC + +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#else // PADDLE_WITH_GRPC + +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#endif // PADDLE_WITH_GRPC + +#endif // PADDLE_WITH_DISTRIBUTE diff --git a/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc similarity index 96% rename from paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc rename to paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc index d192f54ee0..c2cb0d7f04 100644 --- a/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc @@ -17,7 +17,7 @@ limitations under the License. */ // file and did some modifications so that we can send gRPC // requests without too much copying of the tensor data. -#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h similarity index 100% rename from paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h rename to paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc similarity index 99% rename from paddle/fluid/operators/distributed/grpc_client.cc rename to paddle/fluid/operators/distributed/grpc/grpc_client.cc index 8c54159a41..7875c16c3c 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -17,8 +17,8 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h similarity index 98% rename from paddle/fluid/operators/distributed/grpc_client.h rename to paddle/fluid/operators/distributed/grpc/grpc_client.h index 01bf46cc31..fa77d21257 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -39,10 +39,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc similarity index 96% rename from paddle/fluid/operators/distributed/grpc_serde.cc rename to paddle/fluid/operators/distributed/grpc/grpc_serde.cc index a9dea9cfd2..6df4fd36f9 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc @@ -21,9 +21,9 @@ limitations under the License. */ #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h similarity index 93% rename from paddle/fluid/operators/distributed/grpc_serde.h rename to paddle/fluid/operators/distributed/grpc/grpc_serde.h index 16f5293b0e..c9a57beb3a 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.h @@ -27,8 +27,7 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/port.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc similarity index 97% rename from paddle/fluid/operators/distributed/grpc_serde_test.cc rename to paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc index 1936c2c623..749c1bf39a 100644 --- a/paddle/fluid/operators/distributed/grpc_serde_test.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc @@ -21,9 +21,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/distributed/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc similarity index 99% rename from paddle/fluid/operators/distributed/grpc_server.cc rename to paddle/fluid/operators/distributed/grpc/grpc_server.cc index cda102e78d..08f777e279 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/distributed/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc_server.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" using ::grpc::ServerAsyncResponseWriter; diff --git a/paddle/fluid/operators/distributed/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h similarity index 93% rename from paddle/fluid/operators/distributed/grpc_server.h rename to paddle/fluid/operators/distributed/grpc/grpc_server.h index d2524f5e65..2fd3a7a740 100644 --- a/paddle/fluid/operators/distributed/grpc_server.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.h @@ -29,11 +29,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/grpc_service.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_service.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h similarity index 98% rename from paddle/fluid/operators/distributed/grpc_service.h rename to paddle/fluid/operators/distributed/grpc/grpc_service.h index 537429b5fe..0b5c5151e6 100644 --- a/paddle/fluid/operators/distributed/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h @@ -23,7 +23,7 @@ #include #include #include -#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/platform/profiler.h" // NOTE: This method was originally created by tensorflow diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc similarity index 99% rename from paddle/fluid/operators/distributed/grpc_variable_response.cc rename to paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc index 76ad02b030..87e83ca53b 100644 --- a/paddle/fluid/operators/distributed/grpc_variable_response.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc @@ -19,7 +19,7 @@ #include #endif -#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h similarity index 89% rename from paddle/fluid/operators/distributed/grpc_variable_response.h rename to paddle/fluid/operators/distributed/grpc/grpc_variable_response.h index 89df07c92c..3ca1d89f75 100644 --- a/paddle/fluid/operators/distributed/grpc_variable_response.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h @@ -22,13 +22,11 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/variable_response.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index cf14538b1c..c63d653488 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ 
b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" @@ -32,7 +32,7 @@ namespace paddle { namespace operators { namespace distributed { -using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; @@ -117,6 +117,12 @@ static void MergeMultipleVarsIntoOneBySection( auto& id_tensor = scope->FindVar(id_name)->Get(); auto* out_tensor = scope->FindVar(out_name)->GetMutable(); + + PADDLE_ENFORCE_GT( + out_tensor->numel(), 0, + "When calling this method, the LoDTensor's numel must larger than zero. " + "Please check LoDTensor::Resize has been called first."); + auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); bool is_on_cpu_place = true; @@ -138,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection( auto row_numel = dims[1]; - for (size_t i = 0; i < dims[0]; ++i) { + for (int64_t i = 0; i < dims[0]; ++i) { auto id = ids_in_this_section[i]; auto origin_id = id + abs_sections[section_idx]; auto& offsets = id_to_offset[origin_id]; @@ -172,8 +178,9 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context) { - auto& local_scope = context.scope().NewScope(); + const framework::ExecutionContext& context, + const framework::Scope& scope) { + auto& local_scope = scope.NewScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -190,11 +197,11 @@ void prefetch(const std::string& id_name, const std::string& out_name, out_var_names.push_back(out_name + "@" + epmap[i]); } - auto& id_tensor = local_scope.FindVar(id_name)->Get(); + auto& id_tensor = scope.FindVar(id_name)->Get(); std::vector ids_vector; if (platform::is_cpu_place(id_tensor.place())) { auto* id_data = id_tensor.data(); - for (size_t i = 0; i < id_tensor.numel(); ++i) { + for (int64_t i = 0; i < id_tensor.numel(); ++i) { ids_vector.push_back(id_data[i]); } } else { @@ -202,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, PADDLE_THROW("paddle is not compiled with CUDA!"); #else auto cpu_place = platform::CPUPlace(); - framework::Tensor cpu_tensor; + framework::LoDTensor cpu_tensor; auto* cpu_tensor_data = cpu_tensor.mutable_data(id_tensor.dims(), cpu_place); auto stream = @@ -246,8 +253,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, context, &local_scope, &actual_ctx); - - context.scope().DeleteScope(&local_scope); + scope.DeleteScope(&local_scope); } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 53b0fbfb51..2f850a0332 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -27,7 +27,56 @@ 
void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context); + const framework::ExecutionContext& context, + const framework::Scope& scope); + +template +void prefetch_with_reconstruct(const std::string& id_name, + const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope, + framework::LoDTensor* original) { + prefetch(id_name, out_name, table_names, epmap, height_sections, context, + scope); + auto& out = scope.FindVar(out_name)->Get(); + auto& ids = scope.FindVar(id_name)->Get(); + auto* original_value = original->data(); + auto* out_value = out.data(); + size_t original_width = original->numel() / original->dims()[0]; + + bool is_on_cpu_place = true; + if (!platform::is_cpu_place(ids.place())) { + is_on_cpu_place = false; + } + if (is_on_cpu_place) { + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = + original_value + original_width * ids.data()[i]; + std::memcpy(original_row, out_rows, original_width * sizeof(T)); + } + } else { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("paddle is not compiled with CUDA!"); +#else + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& actual_ctx = *pool.Get(context.GetPlace()); + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = + original_value + original_width * ids.data()[i]; + auto stream = + static_cast(&actual_ctx)->stream(); + memory::Copy(boost::get(ids.place()), original_row, + platform::CPUPlace(), out_rows, original_width * sizeof(T), + stream); + } +#endif + } +} }; // namespace distributed }; // namespace operators diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index d2b0eb6ca6..27ca1f4edc 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -84,7 +84,9 @@ class ProtoEncodeHelper { ~ProtoEncodeHelper() { #define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't do operations that went over max_size promised - paddle::platform::throw_on_error(p_ <= limit_); + if (paddle::platform::is_error(p_ <= limit_)) { + paddle::platform::throw_on_error(p_ <= limit_); + } #undef REPLACE_ENFORCE_GLOG } diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 122619d41b..cc5b9c29a1 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/distributed/rpc_server.h" + #include #include #include #include - -#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index c3dd459fc4..089ea623f1 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -21,7 +21,7 @@ limitations under the License. 
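A sketch of how a lookup-style kernel could drive the new prefetch_with_reconstruct<T> helper declared above. Only the helper's parameter order comes from parameter_prefetch.h; the kernel, input/output names, and attribute types are assumptions for illustration:

    #include <string>
    #include <vector>

    #include "paddle/fluid/framework/lod_tensor.h"
    #include "paddle/fluid/framework/operator.h"
    #include "paddle/fluid/operators/distributed/parameter_prefetch.h"

    // Hypothetical helper: fetch remote rows selected by "Ids" and write them
    // back into local_table at the offsets given by those ids.
    template <typename T>
    void RemotePrefetchRows(const paddle::framework::ExecutionContext& ctx,
                            paddle::framework::LoDTensor* local_table) {
      const auto& id_name = ctx.Inputs("Ids").front();
      const auto& out_name = ctx.Outputs("Out").front();
      auto table_names = ctx.Attr<std::vector<std::string>>("table_names");
      auto epmap = ctx.Attr<std::vector<std::string>>("epmap");
      // Element type of height_sections is an assumption (int64_t here).
      auto sections = ctx.Attr<std::vector<int64_t>>("height_sections");
      paddle::operators::distributed::prefetch_with_reconstruct<T>(
          id_name, out_name, table_names, epmap, sections, ctx, ctx.scope(),
          local_table);
    }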
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_server.h" diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in index 2637619f30..b39eef04d8 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -1,4 +1,3 @@ - /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,13 +17,8 @@ package sendrecv; option cc_generic_services = @cc_generic_services@; service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. rpc GetVariable(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} @@ -33,19 +27,12 @@ service SendRecvService { rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} } -// VariableMessage is serialized paddle variable message. -// It can be: -// LoDTensor -// SelectedRows enum VarType { LOD_TENSOR = 0; SELECTED_ROWS = 1; NCCL_ID = 2; } -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. message VariableMessage { enum Type { // Pod Types @@ -62,21 +49,14 @@ message VariableMessage { string varname = 1; // TODO(Yancey1989): reference framework::proto::VarDesc::VarType VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: Type data_type = 3; repeated int64 dims = 4; - // lod details: int64 lod_level = 5; repeated LodData lod = 6; - // selected_rows height, aka. original dim0 int64 slr_height = 7; - // tensor data bytes serialized = 8; - // selected_rows data bytes rows = 9; - // Look up table block execution output variable name. string out_varname = 10; // If 1, the ps server will start profiling, the ps // server stops profiling and generates a profile to /tmp/profile_ps_* diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 25e2f77fb7..e5c96507e9 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include // NOLINT #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 6a87178be5..5457101a5c 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/platform/port.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index a4324f67bb..294cae5f44 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -25,7 +25,7 @@ #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" DECLARE_string(rpc_server_profile_path); diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 3c0b7ff24f..a8bb597cbd 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -33,7 +33,7 @@ register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS}) if(WITH_GPU AND NOT WIN32) set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common) - op_library(gen_nccl_id_op ${DISTRIBUTE_DEPS} nccl_common) + op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common) endif() set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc index a3b5ff8d17..a09bff351f 100644 --- a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc +++ b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/string/printf.h" diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc index 8754856e14..7275ab201f 100644 --- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc +++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc index ef574ccdf4..80d712a0e0 100644 --- a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/platform/nccl_helper.h" diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 20870ea07e..629f364d71 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include "gflags/gflags.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" diff --git a/paddle/fluid/operators/distributed_ops/prefetch_op.cc b/paddle/fluid/operators/distributed_ops/prefetch_op.cc index 86425aba8c..52b96d5f8e 100644 --- a/paddle/fluid/operators/distributed_ops/prefetch_op.cc +++ b/paddle/fluid/operators/distributed_ops/prefetch_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 0399ff4100..48065437e3 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc index 8ca2877d8a..ae1b10c3b6 100644 --- a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 0bf4bebbc9..e2c2147ab5 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.h b/paddle/fluid/operators/distributed_ops/split_ids_op.h index acc9b1e622..6676ecd1c8 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.h @@ -116,7 +116,7 @@ class SplitIdsOpKernel : public framework::OpKernel { } else { PADDLE_THROW( "% should be LoDTensor or SelectedRows, but the received type is %s", - ctx.Inputs("Ids")[0], ids_var->Type().name()); + ctx.Inputs("Ids")[0], framework::ToTypeName(ids_var->Type())); } } }; diff --git a/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc b/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc index a73cb08eca..1598e1d0a4 100644 --- a/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc +++ b/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index a8b8a67a11..7a7a3989c0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -83,7 +83,7 @@ class ElementwiseMulKernel : public framework::OpKernel { z = ctx.Output("Out"); } else { PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - x_var->Type().name()); + framework::ToTypeName(x_var->Type())); } z->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 41644d8cc1..fd2a98cb45 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -178,7 +178,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), "Rank of first input must >= rank of second input."); diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 75dbf1d8bf..3394082497 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -77,7 +77,6 @@ class ExpandKernel : public framework::OpKernel { auto& expand_times = context.Attr>("expand_times"); auto* out0 = context.Output("Out"); Eigen::DSizes bcast_dims; - auto x_dims = in0->dims(); for (size_t i = 0; i < expand_times.size(); ++i) { bcast_dims[i] = expand_times[i]; } diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 1ed8a2ddd1..38e57a41ed 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -146,7 +146,6 @@ class FCOpKernel : public framework::OpKernel { auto w = ctx.Input("W"); auto bias = ctx.Input("Bias"); auto output = ctx.Output("Out"); - auto in_dims = input->dims(); auto w_dims = w->dims(); auto out_dims = output->dims(); int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 38cb33e790..c86430524e 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -12,68 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/fill_constant_op.h" namespace paddle { namespace operators { -class FillConstantInferShape : public framework::InferShapeBase { +class FillConstantOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *ctx) const override { + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of FillConstantOp should not be null."); - auto &shape = ctx->Attrs().Get>("shape"); + auto& shape = ctx->Attrs().Get>("shape"); ctx->SetOutputDim("Out", framework::make_ddim(shape)); } -}; - -class FillConstantOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto data_type = - static_cast(Attr("dtype")); - auto value = Attr("value"); - auto force_cpu = Attr("force_cpu"); - - framework::Tensor *tensor = nullptr; - auto &out_var = *scope.FindVar(Output("Out")); - - if (out_var.IsType()) { - tensor = out_var.GetMutable(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else if (out_var.IsType()) { - tensor = out_var.GetMutable()->mutable_value(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else { - PADDLE_THROW( - "fill constant op's output only" - "supports SelectedRows and LoDTensor"); - } - - if (force_cpu) { - auto cpu = platform::CPUPlace(); - tensor->mutable_data(cpu, data_type); - } else { - tensor->mutable_data(dev_place, data_type); - } - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - math::set_constant(dev_ctx, tensor, value); + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.GetPlace()); } }; class FillConstantOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto data_type = static_cast( + boost::get(op_desc.GetAttr("dtype"))); + auto& out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetDataType(data_type); + } }; class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -107,7 +79,13 @@ Fill up a variable with specified constant value. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, - ops::FillConstantInferShape, ops::FillConstantOpMaker, - paddle::framework::EmptyGradOpMaker, - ops::FillConstantOpVarTypeInference); + +REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker, + ops::FillConstantOpVarTypeInference, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc new file mode 100644 index 0000000000..77027b5a87 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_constant_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h new file mode 100644 index 0000000000..417c5b4da6 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +template +class FillConstantKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto data_type = + static_cast(ctx.Attr("dtype")); + auto value = ctx.Attr("value"); + auto force_cpu = ctx.Attr("force_cpu"); + + framework::Tensor *tensor = nullptr; + + framework::Variable *out_var = ctx.OutputVar("Out"); + + if (out_var->IsType()) { + tensor = out_var->GetMutable(); + tensor->Resize( + framework::make_ddim(ctx.Attr>("shape"))); + } else if (out_var->IsType()) { + tensor = out_var->GetMutable()->mutable_value(); + tensor->Resize( + framework::make_ddim(ctx.Attr>("shape"))); + } else { + PADDLE_THROW( + "fill constant op's output only" + "supports SelectedRows and LoDTensor"); + } + + if (force_cpu) { + tensor->mutable_data(platform::CPUPlace(), data_type); + } else { + tensor->mutable_data(ctx.GetPlace(), data_type); + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(ctx.GetPlace()); + math::set_constant(dev_ctx, tensor, value); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index a0397acab1..42ab8e9966 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -1,6 +1,10 @@ include(operators) -register_operators(EXCLUDES fusion_transpose_flatten_concat_op) +register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op) if (WITH_GPU) op_library(fusion_transpose_flatten_concat_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n") + if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) + op_library(fusion_conv_inception_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n") + endif() endif() diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index f1466f17fe..c8282aefe4 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -241,15 +241,15 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { bool is_reverse = ctx.Attr("is_reverse"); \ bool use_peepholes = ctx.Attr("use_peepholes"); -#define INIT_BASE_SIZES \ - auto ids_dims = ids->dims(); /* T x M*/ \ - auto ids_numel = ids->numel(); /* T x 1*/ \ - auto wh_dims = wh->dims(); /* D x 4D*/ \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const int D3 = D * 3; \ - int64_t row_number = embeddings->dims()[0]; \ - int64_t row_width = embeddings->dims()[1]; \ +#define INIT_BASE_SIZES \ + auto ids_dims = ids->dims(); /* T x M*/ \ + auto ids_numel = framework::product(ids_dims); /* T x 1*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const int D3 = D * 3; \ + int64_t row_number = embeddings->dims()[0]; \ + int64_t row_width = embeddings->dims()[1]; \ const int D4 = wh_dims[1]; #define INIT_BASE_INPUT_DATAS \ diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc new file mode 100644 index 0000000000..fe4c73f472 --- /dev/null +++ 
b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input W of FusedEmbeddingSeqPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input Ids of FusedEmbeddingSeqPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output of FusedEmbeddingSeqPoolOp should not be null."); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + const std::string& combiner = ctx->Attrs().Get("combiner"); + + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_GE(ids_dims.size(), 1, + "The dim size of the 'Ids' tensor must greater than 1."); + PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + // we only support sum now + PADDLE_ENFORCE_EQ(combiner, "sum"); + + int64_t last_dim = table_dims[1]; + for (int i = 1; i != ids_dims.size(); ++i) { + last_dim *= ids_dims[i]; + } + + if (ctx->IsRuntime()) { + framework::Variable* ids_var = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); + const auto& ids_lod = ids_var->Get().lod(); + + // in run time, the LoD of ids must be 1 + PADDLE_ENFORCE(ids_lod.size(), 1u, + "The LoD level of Input(Ids) must be 1"); + PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty"); + + int64_t batch_size = ids_lod[0].size() - 1; + + // in run time, the shape from Ids -> output + // should be [seq_length, 1] -> [batch_size, embedding_size] + ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim})); + } else { + // in compile time, the lod level of ids must be 1 + framework::VarDesc* ids_desc = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); + PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1); + + // in compile time, the shape from Ids -> output + // should be [-1, 1] -> [-1, embedding_size] + ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim})); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. 
" + "The last dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("combiner", + "(string, default sum) " + "A string specifying the reduction op. Currently sum " + "are supported, sum computes the weighted sum of the " + "embedding results for each row.") + .SetDefault("sum"); + // NOTE(minqiyang): grad_inplace is an temporal attribute, + // please do NOT set this attribute in python layer. + AddAttr("grad_inplace", + "(boolean, default false) " + "If the grad op reuse the input's variable.") + .SetDefault(false); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update.") + .SetDefault(false); + AddComment(R"DOC( +FusedEmbeddingSeqPool Operator. + +Computes embeddings for the given ids and weights. + +This operator is used to perform lookups on the parameter W, +then computes the weighted sum of the lookups results for each row +and concatenated into a dense tensor. + +The input Ids should carry the LoD (Level of Details) information. +And the output will change the LoD information with input Ids. + +)DOC"); + } +}; + +class FusedEmbeddingSeqPoolOpGradDescMaker + : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { + return "fused_embedding_seq_pool_grad"; + } +}; + +class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class FusedEmbeddingSeqPoolOpGradVarTypeInference + : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "fused_embedding_seq_pool_grad op " + << framework::GradVarName("W") << " is set to SelectedRows"; + block->Var(out_var_name) + ->SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + VLOG(3) << "fused_embedding_seq_pool_grad op " + << framework::GradVarName("W") << " is set to LoDTensor"; + block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); + } + block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp, + ops::FusedEmbeddingSeqPoolOpGradDescMaker, + ops::FusedEmbeddingSeqPoolOpMaker); +REGISTER_OPERATOR(fused_embedding_seq_pool_grad, + ops::FusedEmbeddingSeqPoolOpGrad, + ops::FusedEmbeddingSeqPoolOpGradVarTypeInference); + +REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool, + ops::FusedEmbeddingSeqPoolKernel, + ops::FusedEmbeddingSeqPoolKernel); +REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad, + ops::FusedEmbeddingSeqPoolGradKernel, + ops::FusedEmbeddingSeqPoolGradKernel); diff --git 
a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h new file mode 100644 index 0000000000..758432fd9e --- /dev/null +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +template +struct EmbeddingVSumFunctor { + void operator()(const framework::ExecutionContext &context, + const LoDTensor *table_t, const LoDTensor *ids_t, + LoDTensor *output_t) { + auto *table = table_t->data(); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + int64_t last_dim = output_t->dims()[1]; + const int64_t *ids = ids_t->data(); + auto ids_lod = ids_t->lod()[0]; + int64_t ids_count = ids_t->numel() / ids_lod.back(); + + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { + size_t begin = ids_lod[i] * ids_count; + for (int64_t j = 0; j != ids_count; ++j) { + PADDLE_ENFORCE_LT(ids[begin], row_number); + PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); + blas.VCOPY(row_width, table + ids[begin + j] * row_width, + output + i * last_dim + j * row_width); + } + + for (int64_t r = (ids_lod[i] + 1) * ids_count; + r < ids_lod[i + 1] * ids_count; ++r) { + PADDLE_ENFORCE_LT(ids[r], row_number); + PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); + blas.AXPY(row_width, 1., table + ids[r] * row_width, + output + i * last_dim + (r % ids_count) * row_width); + } + } + } +}; + +template +class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const LoDTensor *ids_t = context.Input("Ids"); // int tensor + LoDTensor *output_t = context.Output("Out"); // float tensor + const LoDTensor *table_var = context.Input("W"); + const std::string &combiner_type = context.Attr("combiner"); + + if (combiner_type == "sum") { + EmbeddingVSumFunctor functor; + functor(context, table_var, ids_t, output_t); + } + } +}; + +template +class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + DDim table_dim; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW( + "The parameter 
W of a LookupTable " + "must be either LoDTensor or SelectedRows"); + } + + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. + if (is_sparse) { + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); + + auto *ids_data = ids->data(); + int64_t ids_num = ids->numel(); + auto lod = ids->lod()[0]; + int64_t row_width = d_output->dims()[1]; + + framework::Vector *new_rows = d_table->mutable_rows(); + new_rows->resize(ids_num); + std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t)); + + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_num, table_dim[1]}); + T *d_table_data = d_table_value->mutable_data(context.GetPlace()); + const T *d_output_data = d_output->data(); + + auto blas = math::GetBlas(context); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + int64_t h = static_cast(lod[i + 1] - lod[i]); + int64_t in_offset = lod[i] * row_width; + const T *out_pos = d_output_data + i * row_width; + T *in_pos = d_table_data + in_offset; + for (int r = 0; r != h; ++r) { + blas.VCOPY(row_width, out_pos, in_pos + r * row_width); + } + } + } else { + LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now"; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc new file mode 100644 index 0000000000..4690bd766d --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
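As a rough illustration of the "sum" combiner implemented by EmbeddingVSumFunctor above, here is a simplified standalone sketch (plain C++, not Paddle's API) that assumes one id per time step and ignores the BLAS VCOPY/AXPY optimizations; lod holds the sequence offsets as in the original kernel.

#include <cstddef>
#include <cstdint>
#include <vector>

// Sum-pool embedding lookups per sequence:
// out[i] = sum of table rows ids[lod[i]] .. ids[lod[i+1]-1].
void EmbeddingSumPool(const std::vector<float>& table, int64_t width,
                      const std::vector<int64_t>& ids,
                      const std::vector<std::size_t>& lod,
                      std::vector<float>* out) {
  out->assign((lod.size() - 1) * width, 0.f);
  for (std::size_t i = 0; i + 1 < lod.size(); ++i) {
    for (std::size_t r = lod[i]; r < lod[i + 1]; ++r) {
      const float* row = table.data() + ids[r] * width;
      for (int64_t k = 0; k < width; ++k) {
        (*out)[i * width + k] += row[k];
      }
    }
  }
}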
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +class ConvInceptionFusionOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + // 1 x + auto in_dims = ctx->GetInputDim("Input"); + // 4 filters + auto w_dims = ctx->GetInputsDim("Filter"); + + PADDLE_ENFORCE(in_dims.size(), 4, "Conv input should be a 4-D tensor."); + PADDLE_ENFORCE_EQ(w_dims.size(), 4, "There should be 4 filters"); + PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1]); + PADDLE_ENFORCE_EQ(w_dims[1][1], in_dims[1]); + + int n = in_dims[0]; + // compute output channel + // 1st channel + int c = w_dims[0][0]; + // add 2nd channel + c += (w_dims[1][0] - w_dims[2][1] * 2); + // add 3rd channel + c += (w_dims[2][0] - w_dims[3][1]); + // add 4-th channel + c += w_dims[3][0]; + + int h = in_dims[2]; + int w = in_dims[3]; + + ctx->SetOutputDim("Output", {n, c, h, w}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input("Input")->type(), ctx.device_context()); + } +}; + +class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker { + protected: + void Make() override { + AddInput("Input", "(Tensor) NCHW layout."); + AddInput("Filter", "(vector) 4 aggregated filters").AsDuplicable(); + AddInput("Bias", "(vector) its length is equal to Filter") + .AsDuplicable(); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator. " + "The format of output tensor is also NCHW."); + AddOutput("TempOutput", "").AsDuplicable(); + AddAttr( + "pooling_type", + "(string), pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddAttr( + "exclusive", + "(bool, default True) When true, will exclude the zero-padding in the " + "averaging calculation, otherwise, include the zero-padding. Note, it " + "is only used when pooling_type is avg. The default is True.") + .SetDefault(true); + AddAttr( + "activation", + "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' " + "'relux', 'tanh', 'band_pass'") + .SetDefault("relu"); + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. Need to set use_cudnn to true. " + "workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(4096); + AddComment(R"DOC( +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(conv2d_inception_fusion, ops::ConvInceptionFusionOp, + ops::ConvInceptionFusionOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu new file mode 100644 index 0000000000..6e13887866 --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -0,0 +1,272 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +DECLARE_uint64(conv_workspace_size_limit); + +namespace paddle { +namespace operators { + +#if CUDNN_VERSION >= 7100 +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using ScopedActivationDescriptor = platform::ScopedActivationDescriptor; +using DataLayout = platform::DataLayout; + +using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; +using PoolingMode = platform::PoolingMode; +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; + +template +using CudnnDataType = platform::CudnnDataType; + +template +class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto* input = ctx.Input("Input"); + auto filters = ctx.MultiInput("Filter"); + auto bias = ctx.MultiInput("Bias"); + + auto* output = ctx.Output("Output"); + auto temp_outs = ctx.MultiOutput("TempOutput"); + + const std::string pool_type = ctx.Attr("pooling_type"); + const std::string activation = ctx.Attr("activation"); + const bool exclusive = ctx.Attr("exclusive"); + + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + T* temp_data = temp_outs[0]->mutable_data(input->dims(), ctx.GetPlace()); + + DataLayout layout = DataLayout::kNCHW; + std::vector in_dim = framework::vectorize2int(input->dims()); + + // ------------------- cudnn descriptors --------------------- + PoolingMode pooling_mode; + if (pool_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = exclusive ? 
PoolingMode::kAverageExclusive + : (PoolingMode::kAverageInclusive); + } + std::vector k0x0 = {0, 0}; + std::vector k1x1 = {1, 1}; + std::vector k1x1_2 = {1, 1}; + std::vector k3x3 = {3, 3}; + ScopedPoolingDescriptor pool_desc; + ScopedActivationDescriptor act_desc; + ScopedTensorDescriptor out_pool_desc; + ScopedTensorDescriptor input_desc; + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1); + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + + cudnnDataType_t cudnn_dtype = CudnnDataType::type; + cudnnTensorDescriptor_t* out_desc = new cudnnTensorDescriptor_t[4]; + cudnnFilterDescriptor_t* filter_desc = new cudnnFilterDescriptor_t[4]; + cudnnTensorDescriptor_t* bias_desc = new cudnnTensorDescriptor_t[4]; + cudnnTensorDescriptor_t* in_desc = new cudnnTensorDescriptor_t[4]; + cudnnConvolutionDescriptor_t* conv_desc = + new cudnnConvolutionDescriptor_t[4]; + for (int i = 0; i < 4; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i])); + } + + std::vector> filter_dims; + std::vector> bias_dims; + std::vector> in_dims; + std::vector> out_dims; + std::vector> in_strides; + std::vector> out_strides; + std::vector> bias_strides; + + cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW; + int n = in_dim[0]; + int h = in_dim[2]; + int w = in_dim[3]; + int oc = output->dims()[1]; + + cudnnDataType_t compute_type = (cudnn_dtype == CUDNN_DATA_DOUBLE) + ? CUDNN_DATA_DOUBLE + : CUDNN_DATA_FLOAT; + + for (int i = 0; i < 4; ++i) { + filter_dims.push_back(framework::vectorize2int(filters[i]->dims())); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data())); + bias_dims.push_back({1, filter_dims[i][0], 1, 1}); + bias_strides.push_back({filter_dims[i][0], 1, 1, 1}); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(), + bias_strides[i].data())); + in_dims.push_back({n, filter_dims[i][1], h, w}); + out_dims.push_back({n, filter_dims[i][0], h, w}); + in_strides.push_back({filter_dims[i][1] * h * w, h * w, w, 1}); + out_strides.push_back({oc * h * w, h * w, w, 1}); + + if (i < 2) { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor( + conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(), + CUDNN_CROSS_CORRELATION, compute_type)); + } else { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor( + conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(), + CUDNN_CROSS_CORRELATION, compute_type)); + } + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + conv_desc[i], CUDNN_DEFAULT_MATH)); + } + in_dims[2][1] *= 2; + in_strides[2][0] = oc * h * w; + out_strides[2][0] = filter_dims[2][0] * h * w; // this out is continuous. 
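The stride vectors built above follow the usual NCHW convention; a tiny illustrative helper (not part of the patch, names are hypothetical) spelling that convention out:

#include <array>

// NCHW strides: moving one step along N, C, H, W skips C*H*W, H*W, W and 1
// elements respectively; using the full output channel count as the batch
// stride lets a descriptor address a channel slice of a larger buffer.
std::array<int, 4> NCHWStrides(int c, int h, int w) {
  return {c * h * w, h * w, w, 1};
}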
+ in_strides[3][0] = filter_dims[2][0] * h * w; + CUDNN_ENFORCE( + platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2)); + + cudnnConvolutionFwdAlgo_t algo[4]; + auto handle = dev_ctx.cudnn_handle(); + size_t workspace_size_in_bytes = 0; // final workspace to allocate. + + size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) { + int64_t max_user_size = + std::max(static_cast(FLAGS_conv_workspace_size_limit), + user_workspace_size); + workspace_size_limit = max_user_size * 1024 * 1024; + } + + for (int i = 0; i < 4; ++i) { + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data())); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + out_desc[i], cudnn_dtype, 4, out_dims[i].data(), + out_strides[i].data())); + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, + &algo[i])); + size_t tmp_size = 0; + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], + algo[i], &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + cudnnActivationDescriptor_t cudnn_act_desc = + act_desc.descriptor(activation); + + int oc0 = filter_dims[0][0]; + int oc1 = filter_dims[1][0] - filter_dims[2][1] * 2; + int oc3 = filter_dims[3][0]; + int oc2 = oc - oc0 - oc1 - oc3; + + // branch1: pool + 1x1 conv + ScalingParamType alpha = 1.0f, beta = 0.0f; + CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward( + handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, + pool_out_desc, temp_data)); + + std::vector in_datas; + in_datas.push_back(static_cast(temp_data)); + in_datas.push_back(static_cast(input_data)); + in_datas.push_back( + static_cast(output_data + (oc0 + oc1) * h * w)); + T* temp2_data = temp_outs[1]->mutable_data( + framework::make_ddim(out_dims[2]), ctx.GetPlace()); + in_datas.push_back(static_cast(temp2_data + oc2 * h * w)); + + std::vector out_datas; + out_datas.push_back(static_cast(output_data)); + out_datas.push_back(static_cast(output_data + oc0 * h * w)); + out_datas.push_back(static_cast(temp2_data)); + out_datas.push_back( + static_cast(output_data + (oc0 + oc1 + oc2) * h * w)); + + for (int i = 0; i < 4; ++i) { + auto func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], + static_cast(filters[i]->data()), conv_desc[i], + algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, + out_desc[i], out_datas[i], bias_desc[i], + static_cast(bias[i]->data()), cudnn_act_desc, + out_desc[i], out_datas[i])); + }; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + workspace_handle.RunFunc(func, workspace_size_in_bytes); + } + + cudnnTensorDescriptor_t x_desc; + cudnnTensorDescriptor_t y_desc; + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&x_desc)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&y_desc)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data())); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data())); + 
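The four fused branches above run one after another and reuse a single cuDNN scratch buffer, so only the largest per-branch workspace requirement matters; an illustrative plain-C++ sketch of that sizing strategy (not Paddle's API):

#include <algorithm>
#include <cstddef>
#include <vector>

// One buffer is allocated and reused by branches that run sequentially,
// so it only needs to be as large as the most demanding branch.
std::size_t SharedWorkspaceBytes(const std::vector<std::size_t>& per_branch_bytes) {
  std::size_t needed = 0;
  for (std::size_t bytes : per_branch_bytes) needed = std::max(needed, bytes);
  return needed;
}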
CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor( + handle, CudnnDataType::kOne(), x_desc, + static_cast(out_datas[2]), CudnnDataType::kZero(), + y_desc, static_cast(output_data + (oc0 + oc1) * h * w))); + + for (int i = 0; i < 4; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i])); + } + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(x_desc)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(y_desc)); + } +}; +#endif + +} // namespace operators +} // namespace paddle + +#if CUDNN_VERSION >= 7100 +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion, + ops::CUDNNConvInceptionFusionOpKernel, + ops::CUDNNConvInceptionFusionOpKernel); +#endif diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index a807117115..6ca6f0bc04 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -67,6 +67,11 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); PADDLE_ENFORCE(ctx->HasOutput("PreOut"), "Output(PreOut) should not be null."); + auto with_prefetch = ctx->Attrs().Get("remote_prefetch"); + if (with_prefetch) { + PADDLE_ENFORCE(ctx->HasOutput("W_Out"), + "Output(W_Out) should not be null."); + } const int64_t batch_size = ctx->GetInputDim("X")[0]; std::vector output_shape({batch_size, 1}); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); @@ -95,7 +100,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Label", "(LoDTensor, required), The labels of training data. It's a" "tensor with shape [N, 1]."); - AddInput("PTable", + AddInput("PathTable", "(LoDTensor, optional), The Path Table from root to current word" "it should have shape like [N, L], L is the length of the Path") .AsDispensable(); @@ -119,8 +124,30 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { "[batch_size, code_length], where code_length represents the " "maximum path length from root to leaf nodes.") .AsIntermediate(); + AddOutput( + "W_Out", + "(LoDTensor, optinal) using input 'W' as Output to make it mutable" + "When we are using prefetch") + .AsIntermediate(); AddAttr("num_classes", "(int, optional), The number of classes") .SetDefault(2); + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); AddComment(R"DOC( The hierarchical sigmoid operator organize the classes into a binary tree. 
At each node, a sigmoid function is used to calculate the probability of @@ -189,23 +216,17 @@ class HierarchicalSigmoidGradOpGradVarTypeInference << " is set to SelectedRows"; block->Var(w_grad_var_name) ->SetType(framework::proto::VarType::SELECTED_ROWS); - if (hasBias) { - VLOG(30) << "hierarchical_sigmoid_grad op " - << framework::GradVarName("Bias") << " is set to SelectedRows"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::SELECTED_ROWS); - } } else { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; block->Var(w_grad_var_name) ->SetType(framework::proto::VarType::LOD_TENSOR); - if (hasBias) { - VLOG(30) << "hierarchical_sigmoid_grad op " - << framework::GradVarName("Bias") << " is set to LoDTensor"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::LOD_TENSOR); - } + } + if (hasBias) { + VLOG(30) << "hierarchical_sigmoid_grad op " + << framework::GradVarName("Bias") << " is set to LoDTensor"; + block->Var(bias_grad_var_name) + ->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType()); } diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index d212e6f843..1a7ca96301 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" @@ -24,6 +26,10 @@ limitations under the License. */ #include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/fluid/platform/transform.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -34,8 +40,9 @@ using platform::Transform; static std::vector PathToRows(const framework::LoDTensor& path) { std::set rows; + const int64_t* paths = path.data(); for (int64_t i = 0; i < path.numel(); ++i) { - int64_t row = path.data()[i]; + int64_t row = paths[i]; if (row < 0) { continue; } @@ -49,13 +56,54 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& in = detail::Ref(ctx.Input("X")); auto& w = detail::Ref(ctx.Input("W")); - auto* path = ctx.Input("PTable"); + auto* path = ctx.Input("PathTable"); auto* code = ctx.Input("PathCode"); auto& label = detail::Ref(ctx.Input("Label")); auto* bias = ctx.Input("Bias"); auto* out = ctx.Output("Out"); auto* pre_out = ctx.Output("PreOut"); size_t num_classes = static_cast(ctx.Attr("num_classes")); + // for remote prefetch + + auto epmap = ctx.Attr>("epmap"); + if (!epmap.empty()) { + // if epmap is not empty, then the parameter will be fetched from remote + // parameter + // server + auto height_sections = ctx.Attr>("height_sections"); + auto table_names = ctx.Attr>("table_names"); + std::vector real_rows = PathToRows(*path); + framework::Scope& local_scope = ctx.scope().NewScope(); + auto* ids = local_scope.Var("Ids@Prefetch"); + auto* x_tensor = ids->GetMutable(); + + x_tensor->mutable_data( + framework::make_ddim({static_cast(real_rows.size()), 1}), + ctx.GetPlace()); + // copy. 
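The PathToRows helper used by this prefetch path collects the distinct, non-negative ids of the padded path table (negative entries act as padding), so only those parameter rows need to be fetched from the parameter server. A standalone sketch of that idea, with illustrative types:

#include <cstdint>
#include <set>
#include <vector>

// Collect the distinct, non-negative ids appearing in a padded path table.
std::vector<int64_t> PathToRows(const std::vector<int64_t>& path) {
  std::set<int64_t> rows;
  for (int64_t id : path) {
    if (id >= 0) rows.insert(id);
  }
  return {rows.begin(), rows.end()};
}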
+ + std::memcpy(x_tensor->data(), real_rows.data(), + real_rows.size() * sizeof(int64_t)); + + framework::DDim w_dims = ctx.Input("W")->dims(); + w_dims[0] = x_tensor->dims()[0]; + auto* w_tensor = + local_scope.Var("W@Prefetch")->GetMutable(); + w_tensor->Resize(w_dims); + +#ifdef PADDLE_WITH_DISTRIBUTE + // w_Out is set to used by prefetch, never change it in other cases + auto* w_out = ctx.Output("W_Out"); + operators::distributed::prefetch_with_reconstruct( + "Ids@Prefetch", "W@Prefetch", table_names, epmap, height_sections, + ctx, local_scope, w_out); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + } + bool is_custom = false; if (path) { is_custom = true; @@ -116,9 +164,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& in = detail::Ref(ctx.Input("X")); auto& w = detail::Ref(ctx.Input("W")); - auto* path = ctx.Input("PTable"); + auto* path = ctx.Input("PathTable"); auto* code = ctx.Input("PathCode"); - auto* bias = ctx.Input("Bias"); auto* in_grad = ctx.Output(framework::GradVarName("X")); bool is_sparse = ctx.Attr("is_sparse"); @@ -173,15 +220,14 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { } // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // be consistent with the clipping in forward. - + auto* bias_grad = + ctx.Output(framework::GradVarName("Bias")); + if (bias_grad) { + bias_grad->mutable_data(ctx.GetPlace()); + zero(dev_ctx, bias_grad, static_cast(0.0)); + bit_code->AddGrad(pre_out_grad, bias_grad); + } if (!is_sparse) { - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } auto* w_grad = ctx.Output(framework::GradVarName("W")); w_grad->mutable_data(ctx.GetPlace()); @@ -200,21 +246,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); zero(dev_ctx, w_grad_value, static_cast(0.0)); - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->set_rows(real_rows); - // build ids -> rows index map - bias_grad->SyncIndex(); - bias_grad->set_height(bias->dims()[0]); - auto* bias_grad_value = bias_grad->mutable_value(); - std::vector dims = {static_cast(real_rows.size()), - bias->dims()[1]}; - bias_grad_value->mutable_data(framework::make_ddim(dims), - ctx.GetPlace()); - zero(dev_ctx, bias_grad_value, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } bit_code->MulGradWeight(pre_out_grad, w_grad, in); } bit_code->MulGradError(pre_out_grad, w, in_grad); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index 69e7fa4490..f458ce6c83 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -88,7 +88,6 @@ class HingeLossGradOp : public framework::OperatorWithKernel { "Input(Logits@GRAD) should not be null."); auto pred_dims = ctx->GetInputDim("Logits"); - auto lab_dims = ctx->GetInputDim("Labels"); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h index 9efda3dfc9..fa21bd01cb 100644 --- a/paddle/fluid/operators/huber_loss_op.h +++ 
b/paddle/fluid/operators/huber_loss_op.h @@ -105,14 +105,16 @@ class HuberLossGradKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); x_grad.device(place) = - out_grad * residual.unaryExpr(HuberLossBackward(delta, -1.0)); + residual.unaryExpr(HuberLossBackward(delta, -1.0)); + x_grad.device(place) = out_grad * x_grad; } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenVector::Flatten(*out1); y_grad.device(place) = - out_grad * residual.unaryExpr(HuberLossBackward(delta, 1.0)); + residual.unaryExpr(HuberLossBackward(delta, 1.0)); + y_grad.device(place) = out_grad * y_grad; } } }; diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index 998b7f09c3..1da14631e3 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -230,10 +230,12 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { if (ctx->HasOutput(framework::GradVarName("Emission"))) { ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims); + ctx->ShareLoD("Emission", framework::GradVarName("Emission")); } if (ctx->HasOutput(framework::GradVarName("Transition"))) { ctx->SetOutputDim(framework::GradVarName("Transition"), transition_exps_dims); + ctx->ShareLoD("Transition", framework::GradVarName("Transition")); } } diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index e28d199eeb..c4a2282e16 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -38,13 +38,13 @@ class LoadCombineOp : public framework::OperatorBase { static_cast(out_var_names.size()), 0, "The number of output variables should be greater than 0."); if (!model_from_memory) { - std::ifstream fin(filename); + std::ifstream fin(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load_combine op", filename); LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); } else { PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory"); - std::stringstream fin(filename); + std::stringstream fin(filename, std::ios::in | std::ios::binary); LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); } } diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 06773d1d0e..4bce4eba22 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -34,7 +34,7 @@ class LoadOp : public framework::OperatorBase { // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. 
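The load_combine and load operators above now open their input streams with std::ios::binary; the files hold raw serialized tensors, so text-mode newline translation (notably on Windows) could otherwise corrupt the payload. A small illustrative sketch of reading a file as raw bytes:

#include <fstream>
#include <iterator>
#include <string>
#include <vector>

// Read a file as raw bytes; std::ios::binary disables any text-mode newline
// translation so serialized data comes back exactly as it was written.
std::vector<char> ReadAllBytes(const std::string& path) {
  std::ifstream fin(path, std::ios::binary);
  return std::vector<char>(std::istreambuf_iterator<char>(fin),
                           std::istreambuf_iterator<char>());
}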
auto filename = Attr("file_path"); - std::ifstream fin(filename); + std::ifstream fin(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", filename); diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 9d248e0321..ef1fb83aa6 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -92,7 +92,6 @@ class LogLossGradOp : public framework::OperatorWithKernel { "Output(Predicted@GRAD) should not be null."); auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 6a0d6bad51..fd15539f7b 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -92,7 +92,8 @@ class LookupTableCUDAKernel : public framework::OpKernel { // server #ifdef PADDLE_WITH_DISTRIBUTE operators::distributed::prefetch(id_name, out_name, table_names, epmap, - height_sections, context); + height_sections, context, + context.scope()); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 3a73a7637c..a7d0fd4856 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -59,7 +59,8 @@ class LookupTableKernel : public framework::OpKernel { // server #ifdef PADDLE_WITH_DISTRIBUTE operators::distributed::prefetch(id_name, out_name, table_names, epmap, - height_sections, context); + height_sections, context, + context.scope()); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 0a18882e81..4e4f977fcc 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -50,8 +50,8 @@ template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + const bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); @@ -132,8 +132,8 @@ template class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + const bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); PADDLE_ENFORCE( diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h index 895a7019aa..d1127ce4a2 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -37,9 +37,6 @@ void Transpose::operator()( for (int i = 0; i < Rank; i++) { permute[i] = axis[i]; } - auto in_dim = in.dims(); - auto out_dim = out->dims(); - auto eigen_in = framework::EigenTensor::From(in); auto 
eigen_out = framework::EigenTensor::From(*out); auto* dev = context.eigen_device(); diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index d55e832cc2..d6f51c6e5c 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -84,41 +84,6 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, code_table_.apply_visitor(func); } -template -struct MatrixBitCodeFunctorSelectedRowsAddGrad - : public boost::static_visitor { - const framework::Tensor &tmat_; - framework::SelectedRows *vec_; - - MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat, - framework::SelectedRows *vec) - : tmat_(tmat), vec_(vec) {} - - template - void operator()(const CodeTable &code_table) { - size_t batch_size = tmat_.dims()[0]; - size_t width = tmat_.dims()[1]; - auto *vec_data = vec_->mutable_value()->template data(); - auto *tmat_data = tmat_.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table.get_code(i); - int code_length = code.get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); - int64_t row_index = vec_->GetIndexFromId(static_cast(index)); - vec_data[row_index] += tmat_data[i * width + j]; - } - } - } -}; - -template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, - framework::SelectedRows *vec) { - MatrixBitCodeFunctorSelectedRowsAddGrad func(tmat, vec); - code_table_.apply_visitor(func); -} - template struct MatrixBitCodeFunctorSum : public boost::static_visitor { const framework::Tensor &tmat_; diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 01e4889d34..c399cb5d44 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -124,11 +124,12 @@ class SimpleCode { template class CustomCode { public: - CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, - const int64_t* ids, int index) { - seq_len_ = ptable.dims()[1]; - ptable_data_ = ptable.data() + seq_len_ * index; - pcode_data_ = pcode.data() + seq_len_ * index; + CustomCode(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids, + int index) { + seq_len_ = path_table.dims()[1]; + path_table_data_ = path_table.data() + seq_len_ * index; + path_code_data_ = path_code.data() + seq_len_ * index; } /** * Here the id of root should be 1 rather than 0, thus the encoding of class c @@ -139,25 +140,25 @@ class CustomCode { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - size_t calc_index(int bit) const { return ptable_data_[bit]; } - bool calc_bit(int bit) const { return pcode_data_[bit]; } + size_t calc_index(int bit) const { return path_table_data_[bit]; } + bool calc_bit(int bit) const { return path_code_data_[bit]; } // NOTE: this function is not thread-safe. 
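The renamed CustomCode members keep the same length rule as before: a path row is padded with negative values, so its effective length is the offset of the first negative entry (or the full row if there is none). An illustrative standalone sketch:

#include <algorithm>
#include <cstdint>

// Length of a custom code path: position of the first negative (padding)
// entry in the row, or seq_len if the row is fully used.
int CodeLength(const int64_t* path_row, int seq_len) {
  const int64_t* end = path_row + seq_len;
  const int64_t* it =
      std::find_if(path_row, end, [](int64_t v) { return v < 0; });
  return static_cast<int>(it - path_row);
}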
int get_length() const { if (length_ < 0) { auto len = seq_len_; - length_ = - static_cast(std::find_if(ptable_data_, ptable_data_ + len, - [](const T& val) { return val < 0; }) - - ptable_data_); + length_ = static_cast( + std::find_if(path_table_data_, path_table_data_ + len, + [](const T& val) { return val < 0; }) - + path_table_data_); } return length_; } private: int64_t seq_len_; - const T* ptable_data_; - const T* pcode_data_; + const T* path_table_data_; + const T* path_code_data_; mutable int length_{-1}; }; @@ -181,9 +182,9 @@ class SimpleCodeTable { template class CustomCodeTable { public: - CustomCodeTable(const framework::Tensor& ptable, - const framework::Tensor& pcode, const int64_t* ids) - : ptable_(ptable), pcode_(pcode), ids_(ids) {} + CustomCodeTable(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids) + : ptable_(path_table), pcode_(path_code), ids_(ids) {} CustomCode get_code(int64_t code) const { return CustomCode(ptable_, pcode_, ids_, code); @@ -210,11 +211,11 @@ class MatrixBitCodeFunctor { ids_(ids), code_table_(SimpleCodeTable(num_classes, ids)) {} - MatrixBitCodeFunctor(const framework::Tensor& ptable, - const framework::Tensor& pcode, const int64_t* ids) - : num_classes_(static_cast(ptable.dims()[1])), + MatrixBitCodeFunctor(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids) + : num_classes_(static_cast(path_table.dims()[1])), ids_(ids), - code_table_(CustomCodeTable(ptable, pcode, ids)) {} + code_table_(CustomCodeTable(path_table, path_code, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ @@ -225,11 +226,6 @@ class MatrixBitCodeFunctor { */ void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); - /* For selected rows For j < code_length - vec(0, index(i, j)) += tmat(i, j) - */ - void AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec); - /* For j < code_length sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) */ diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 9e99e44822..1d9d98b106 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -76,7 +76,6 @@ class SoftmaxFunctor> { void operator()(const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { auto in_dims = X->dims(); - auto out_dims = Y->dims(); const float* in_data = X->data(); float* out_data = Y->data(); const int kBatchDim = 0; diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index 35db4c1ad1..9954e51083 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -87,7 +87,6 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { "Input(Out@Grad) must not be null."); auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); auto intermediate_dims = ctx->GetInputDim("IntermediateVal"); auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 271428408c..05afdf5324 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -147,12 +147,6 @@ class MulGradOp : public framework::OperatorWithKernel { "Input(Out@GRAD) should not be null"); auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - auto out_dims = 
ctx->GetInputDim(framework::GradVarName("Out")); - - auto x_mat_dims = framework::flatten_to_2d( - x_dims, ctx->Attrs().Get("x_num_col_dims")); - auto y_mat_dims = framework::flatten_to_2d( - y_dims, ctx->Attrs().Get("y_num_col_dims")); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 06c35c789f..256da34912 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -36,7 +36,6 @@ class NCEOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("Input"); auto label_dims = ctx->GetInputDim("Label"); - auto w_dims = ctx->GetInputDim("Weight"); PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1; if (ctx->HasInput("Bias")) { @@ -154,6 +153,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("is_sparse", "(boolean, default false) Sparse update.") .SetDefault(false); + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); + AddAttr>("custom_neg_classes", "This attribute only be used in unitest. Classes " "in this list wiil be used as negative classes " @@ -223,24 +240,20 @@ class NCEOpGradVarTypeInference : public framework::VarTypeInference { void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front(); - auto bias_grad = op_desc.Output(framework::GradVarName("Bias")).front(); auto attr = op_desc.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { - VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad + VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to SelectedRows"; block->Var(weight_grad) ->SetType(framework::proto::VarType::SELECTED_ROWS); - block->Var(bias_grad)->SetType(framework::proto::VarType::SELECTED_ROWS); } else { - VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad + VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to LoDTensor"; block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR); - block->Var(bias_grad)->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType()); - block->Var(bias_grad)->SetDataType(block->Var("Input")->GetDataType()); } }; diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index f2ca6ec247..2c97eef096 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -15,8 +15,10 @@ limitations under the License. */ #pragma once #include +#include #include #include +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -24,6 +26,10 @@ limitations under the License. 
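The remote-prefetch branch added to the NCE kernel below first deduplicates the sampled labels (only the unique weight rows are fetched) and then maps each label back to its row in that compact set. An illustrative sketch of those two steps, with hypothetical helper names:

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <set>
#include <vector>

// Keep one copy of each sampled label, in sorted order.
std::vector<int64_t> Dedup(const std::vector<int64_t>& labels) {
  std::set<int64_t> st(labels.begin(), labels.end());
  return {st.begin(), st.end()};
}

// Row index of a label inside the deduplicated (prefetched) weight block.
int RowIndexOf(const std::vector<int64_t>& unique_labels, int64_t label) {
  auto it = std::find(unique_labels.begin(), unique_labels.end(), label);
  return static_cast<int>(std::distance(unique_labels.begin(), it));
}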
*/ #include "paddle/fluid/operators/math/sampler.h" #include "unsupported/Eigen/CXX11/Tensor" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -43,7 +49,6 @@ void PrepareSamples(const framework::ExecutionContext &context, auto label = context.Input("Label"); const int64_t *label_data = label->data(); auto label_dims = label->dims(); - // int num_total_classes = context.Attr("num_total_classes"); // for unitest std::vector custom_neg_classes = context.Attr>("custom_neg_classes"); @@ -144,15 +149,82 @@ class NCEKernel : public framework::OpKernel { } // forward mul auto input_mat = EigenMatrix::From(*(context.Input("Input"))); - auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - Eigen::Tensor result = - (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * - weight_mat.chip(sample_labels_data[i], 0)) - .sum(); - sample_out_data[i] += result(0); - sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + + // for remote prefetch + auto epmap = context.Attr>("epmap"); + + if (!epmap.empty()) { + // if epmap is not empty, then the parameter will be fetched from remote + // parameter + // server + + std::vector labels; + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + labels.push_back(sample_labels_data[i]); + } + std::set st(labels.begin(), labels.end()); + labels.assign(st.begin(), st.end()); + + framework::Scope &local_scope = context.scope().NewScope(); + + auto height_sections = context.Attr>("height_sections"); + auto table_names = context.Attr>("table_names"); + + auto *ids = local_scope.Var("Ids@Prefetch"); + auto *x_tensor = ids->GetMutable(); + x_tensor->mutable_data( + framework::make_ddim({static_cast(labels.size()), 1}), + context.GetPlace()); + // copy. + std::memcpy(x_tensor->data(), labels.data(), + labels.size() * sizeof(int64_t)); + + std::vector w_dims = paddle::framework::vectorize2int( + context.Input("Weight")->dims()); + w_dims[0] = static_cast(labels.size()); + + auto *w_tensor = local_scope.Var("Weight@Prefetch") + ->GetMutable(); + w_tensor->Resize(framework::make_ddim(w_dims)); + +#ifdef PADDLE_WITH_DISTRIBUTE + operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch", + table_names, epmap, height_sections, + context, local_scope); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + + auto weight_mat = EigenMatrix::From( + (local_scope.Var("Weight@Prefetch")->Get())); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + std::vector::iterator it = + std::find(labels.begin(), labels.end(), sample_labels_data[i]); + int idx = std::distance(labels.begin(), it); + + Eigen::Tensor result = + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(idx, 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } + context.scope().DeleteScope(&local_scope); + } else { + auto weight_mat = + EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + Eigen::Tensor result = + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(sample_labels_data[i], 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. 
+ exp(-sample_out_data[i]))); + } } + // forward cost for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { out_data[i] = 0; @@ -240,18 +312,19 @@ class NCEGradKernel : public framework::OpKernel { sample_grad_data[i] *= d_out_data[sample_idx]; } + // get d_bias + auto d_bias = context.Output(framework::GradVarName("Bias")); + if (d_bias != nullptr) { + T *d_bias_data = d_bias->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + } + } + bool is_sparse = context.Attr("is_sparse"); if (!is_sparse) { - // get d_bias - auto d_bias = context.Output(framework::GradVarName("Bias")); - if (d_bias != nullptr) { - T *d_bias_data = d_bias->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; - } - } // get d_w auto d_w = context.Output(framework::GradVarName("Weight")); if (d_w != nullptr) { @@ -273,34 +346,6 @@ class NCEGradKernel : public framework::OpKernel { std::set st(labels.begin(), labels.end()); labels.assign(st.begin(), st.end()); - auto *bias_var = context.InputVar("Bias"); - DDim bias_dim; - if (bias_var->IsType()) { - bias_dim = context.Input("Bias")->dims(); - } else if (bias_var->IsType()) { - auto *table_t = context.Input("Bias"); - bias_dim = table_t->value().dims(); - } else { - PADDLE_THROW( - "The parameter Bias of a NCE_OP " - "must be either LoDTensor or SelectedRows"); - } - - auto d_bias = - context.Output(framework::GradVarName("Bias")); - d_bias->set_rows(labels); - d_bias->set_height(bias_dim[0]); - - d_bias->mutable_value()->Resize( - {static_cast(labels.size()), bias_dim[1]}); - T *d_bias_data = - d_bias->mutable_value()->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + labels.size(), 0.0); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[d_bias->Index(sample_labels_data[i])] += - sample_grad_data[i]; - } - auto *table_var = context.InputVar("Weight"); DDim table_dim; if (table_var->IsType()) { diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h index 8e7457dd56..2a479081f1 100644 --- a/paddle/fluid/operators/ngraph/ngraph_ops.h +++ b/paddle/fluid/operators/ngraph/ngraph_ops.h @@ -23,5 +23,7 @@ limitations under the License. */ #include "ops/binary_unnary_op.h" #include "ops/fill_constant_op.h" +#include "ops/mean_op.h" #include "ops/mul_op.h" +#include "ops/scale_op.h" #include "ops/top_k_op.h" diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h new file mode 100644 index 0000000000..15fbd58b02 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h @@ -0,0 +1,61 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_NGRAPH +#pragma once + +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +template +std::shared_ptr ElementwiseScalar( + float scale, std::shared_ptr node) { + auto node_shape = node->get_shape(); + auto scale_const = ngraph::op::Constant::create(node->get_element_type(), + node_shape, {scale}); + return std::make_shared(scale_const, node); +} + +template +std::shared_ptr ElementwiseScalar( + std::shared_ptr scale_1d, + std::shared_ptr node) { + auto scale_shape = scale_1d->get_shape(); + PADDLE_ENFORCE_EQ(scale_shape.size(), 1, "Supporting 1d scale node"); + PADDLE_ENFORCE_EQ(scale_shape.at(0), 1, "scale 1d in in shape {1}"); + + auto node_shape = node->get_shape(); + ngraph::AxisSet axis_set; + for (size_t i = 0; i < node_shape.size(); ++i) { + axis_set.insert(i); + } + node_shape.push_back(1); + + auto scale_bcast = + std::make_shared(scale_1d, node_shape, axis_set); + + auto scale_reshape = + paddle::platform::NgReshaper(scale_bcast, node->get_shape()); + + return std::make_shared(scale_reshape, node); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/ngraph/ops/mean_op.h b/paddle/fluid/operators/ngraph/ops/mean_op.h new file mode 100644 index 0000000000..7fcf8f09cd --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/mean_op.h @@ -0,0 +1,68 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_NGRAPH +#pragma once + +#include +#include + +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildMeanNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map); + ngraph::AxisSet axes; + for (size_t i = 0; i < input->get_shape().size(); ++i) { + axes.insert(i); + } + + auto mean = ngraph::builder::mean(input, axes); + auto mean_1d = std::make_shared( + mean, ngraph::AxisVector{}, ngraph::Shape{1}); + paddle::platform::SetOutputNode(op, "Out", mean_1d, ngb_node_map); +} + +void BuildMeanGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto og = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map); + auto x_shape = x->get_shape(); + float x_size = std::accumulate(std::begin(x_shape), std::end(x_shape), 1, + std::multiplies()); + auto node_const = ngraph::op::Constant::create(og->get_element_type(), + ngraph::Shape{1}, {x_size}); + auto node_div = std::make_shared(og, node_const); + + auto result = ElementwiseScalar( + og / node_const, + ngraph::op::Constant::create(og->get_element_type(), x_shape, {0})); + paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/ngraph/ops/scale_op.h b/paddle/fluid/operators/ngraph/ops/scale_op.h new file mode 100644 index 0000000000..24ab0702aa --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/scale_op.h @@ -0,0 +1,41 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_NGRAPH +#pragma once + +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildScaleNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + float scale = op_attrs.Get("scale"); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto out = ElementwiseScalar(scale, x); + paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h index d0224177ec..6c95d3f3bf 100644 --- a/paddle/fluid/operators/norm_op.h +++ b/paddle/fluid/operators/norm_op.h @@ -43,7 +43,6 @@ class NormKernel : public framework::OpKernel { out_norm->mutable_data(ctx.GetPlace()); auto xdim = in_x->dims(); - auto ndim = out_norm->dims(); T eps = static_cast(ctx.Attr("epsilon")); int axis = ctx.Attr("axis"); if (axis < 0) axis = xdim.size() + axis; diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h index 6c616aa03d..3f51bb0b3d 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ b/paddle/fluid/operators/optimizers/adadelta_op.h @@ -27,12 +27,14 @@ class AdadeltaOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto avg_squared_grad_out_tensor = diff --git a/paddle/fluid/operators/optimizers/adagrad_op.h b/paddle/fluid/operators/optimizers/adagrad_op.h index 9f6ef39169..13455fc42c 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.h +++ b/paddle/fluid/operators/optimizers/adagrad_op.h @@ -50,7 +50,8 @@ class AdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto *param_out_tensor = ctx.Output("ParamOut"); auto *moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 1138bb7400..61b9384f84 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -347,7 +347,8 @@ class AdamOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; @@ -423,16 +424,23 @@ class AdamOpKernel : public framework::OpKernel { } } + framework::SelectedRows cpu_grad_merge; const 
framework::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = &grad; } else { // merge duplicated rows if any. // The rows of grad_merge have been sorted inside MergeAdd functor + framework::SelectedRows* grad_merge_var; scatter::MergeAdd merge_func; - auto* grad_merge_var = const_cast(ctx.scope()) - .Var() - ->GetMutable(); + if (platform::is_cpu_place(ctx.GetPlace())) { + grad_merge_var = &cpu_grad_merge; + } else { + // FIXME(qiao): GPU also need to fix this + grad_merge_var = const_cast(ctx.scope()) + .Var() + ->GetMutable(); + } merge_func(ctx.template device_context(), grad, grad_merge_var, true); grad_merge_ptr = grad_merge_var; diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h index 7137fbd965..55d25ecbdd 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.h +++ b/paddle/fluid/operators/optimizers/adamax_op.h @@ -27,12 +27,14 @@ class AdamaxOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h index 5df43d33ef..4abd436927 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h @@ -27,12 +27,14 @@ class DecayedAdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 8f812c9a03..bbf34d8316 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -32,12 +32,14 @@ class FTRLOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto* param_out = ctx.Output("ParamOut"); auto* sq_accum_out = ctx.Output("SquaredAccumOut"); diff --git 
a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index f6ef83c3ba..3ed1bff5ff 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -395,7 +395,7 @@ class MomentumOpKernel : public framework::OpKernel { PADDLE_THROW( string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows " "gradient, but the received Variable Type is %s", - grad_var->Type().name())); + framework::ToTypeName(grad_var->Type()))); } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a9d303d55d..975e4b8e72 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -60,7 +60,8 @@ class SGDOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index 0a9a29956a..f6f40b1daf 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/operators/pool_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -71,7 +72,6 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -130,20 +130,25 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, padding_right_bottom); } - auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), input_format); + + mkldnn::memory::data_type dt = + paddle::framework::ToMKLDNNDataType(input->type()); + + auto src_md = platform::MKLDNNMemDesc(src_tz, dt, input_format); /* create memory descriptor for pooling without specified format * ('any') which lets a primitive (pooling in this case) choose * the memory format preferred for best performance */ - auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, - mkldnn::memory::format::any); - + auto dst_md = + platform::MKLDNNMemDesc(dst_tz, dt, mkldnn::memory::format::any); + auto propagation = src_md.data.data_type == mkldnn_f32 + ? 
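Note on the save_op.cc and save_combine_op.cc hunks that follow: they open the output stream with std::ios::binary so that serialized tensor bytes are written verbatim. A minimal standalone sketch of the idea (illustrative only; WriteBlob and its arguments are hypothetical helpers, not Paddle code):

#include <fstream>
#include <string>

// Text-mode streams on Windows expand every '\n' byte to "\r\n", which corrupts
// binary payloads such as serialized tensors; std::ios::binary writes bytes as-is.
void WriteBlob(const std::string& path, const std::string& bytes) {
  std::ofstream fout(path, std::ios::binary);
  fout.write(bytes.data(), static_cast<std::streamsize>(bytes.size()));
}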
mkldnn::prop_kind::forward_training + : mkldnn::prop_kind::forward_scoring; std::shared_ptr pool_pd = - CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top, - padding_right_bottom, ksize, pooling_type, - mkldnn_engine, ceil_mode, is_test); + CreatePrimitiveDesc(src_md, dst_md, propagation, strides, + padding_left_top, padding_right_bottom, ksize, + pooling_type, mkldnn_engine, ceil_mode, is_test); // save pool_pd into global device context to be referred in backward path if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd); @@ -203,7 +208,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { private: std::unique_ptr CreatePrimitiveDesc( const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst, - const std::vector& stride, const std::vector& padding_left_top, + const mkldnn::prop_kind& propagation, const std::vector& stride, + const std::vector& padding_left_top, const std::vector& padding_right_bot, const std::vector& kernel, const std::string& pooling_type, const mkldnn::engine& engine, bool ceil_mode, bool is_test) const { @@ -411,6 +417,9 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace, - ops::PoolMKLDNNOpKernel); + ops::PoolMKLDNNOpKernel, + ops::PoolMKLDNNOpKernel, + ops::PoolMKLDNNOpKernel); + REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::PoolMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h index 1a424728f7..5666613f6e 100644 --- a/paddle/fluid/operators/psroi_pool_op.h +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -41,7 +41,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { int rois_num = rois->dims()[0]; auto in_stride = framework::stride(in_dims); - auto roi_stride = framework::stride(rois->dims()); auto out_stride = framework::stride(out->dims()); const T* input_data = in->data(); diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 7fc07efe73..56879ffda5 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -49,7 +49,7 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: explicit CTRReader(const std::shared_ptr& queue, - int batch_size, int thread_num, + int batch_size, size_t thread_num, const std::vector& slots, const std::vector& file_list) : batch_size_(batch_size), slots_(slots), file_list_(file_list) { diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index a0b9fa305d..d0edcc170f 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -49,7 +49,7 @@ class SaveCombineOp : public framework::OperatorBase { } MkDirRecursively(DirName(filename).c_str()); - std::ofstream fout(filename); + std::ofstream fout(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", filename); diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index e1c9fd8ff1..fcc598f4f1 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -80,7 +80,7 @@ class SaveOp : public framework::OperatorBase { // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. 
- std::ofstream fout(filename); + std::ofstream fout(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", filename); @@ -122,7 +122,7 @@ class SaveOp : public framework::OperatorBase { // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. - std::ofstream fout(filename); + std::ofstream fout(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", filename); framework::SerializeToStream(fout, selectedRows, dev_ctx); diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h index 03b59d71cc..4bded0efb9 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h @@ -143,8 +143,6 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { set_zero(ctx.template device_context(), x_grad, static_cast(0)); - auto out_grad_stride = framework::stride(out_grad->dims()); - for (size_t i = 0; i < out_lod[0].size() - 1; ++i) { Tensor out_grad_t = out_grad->Slice(static_cast(out_lod[0][i]), diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 767449cde9..5ede972c71 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -63,7 +63,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { } auto *mask_data = cpu_mask->data(); - std::vector> copy_ranges(mask_dim[0]); + std::vector> copy_ranges(2); // set out_true/out_false lod for (size_t t = 0; t < 2; t++) { diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index c3d83a06f2..6a99ad9a90 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -40,7 +40,7 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& dst_stride, T* dst) { paddle::operators::detail::StridedCopyDimVisitor func( dev_ctx, src, src_stride, dst_stride, dst); - boost::apply_visitor(func, dst_dim); + dst_dim.apply_visitor(func); } // Strided numel memory copy from src to dst by the specified axis diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc index f9a16ef35e..c39f94637a 100644 --- a/paddle/fluid/operators/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -245,7 +245,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 83afe5819a..71fcaafe6b 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -132,7 +132,7 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_THROW("Cannot find the input data type by all input data"); } PADDLE_THROW("Unexpected branch. 
Input type is %s", - x_vars[0]->Type().name()); + framework::ToTypeName(x_vars[0]->Type())); } }; diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 76cc796a9b..a8b2df186d 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -163,7 +163,7 @@ class SumKernel : public framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index d1dff16ddd..1f51b5bab3 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -84,6 +84,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) +cc_library(timer SRCS timer.cc) +cc_test(timer_test SRCS timer_test.cc DEPS timer) + cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) @@ -97,7 +100,7 @@ ENDIF() nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) if(WITH_GPU) - nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) + nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator) else() - cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) + cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator) endif() diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 022afb686b..6f38dbb7a2 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -285,7 +285,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) if (dynload::HasCUDNN()) { auto local_cudnn_version = cudnn_dso_ver / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; - if (local_cuda_version < compile_cuda_version) { + if (local_cudnn_version < compile_cudnn_version) { LOG_FIRST_N(WARNING, 1) << "WARNING: device: " << place_.device << ". 
The installed Paddle is compiled with CUDNN " diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index f3cd3b2bbe..91d9a1ef01 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_R6 +CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); +#endif + #ifdef CUDNN_DNN_ROUTINE_EACH_R7 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 990e44cd21..15d5168366 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -53,6 +53,12 @@ namespace platform { namespace dynload { static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; +#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) +static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll"; +static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll"; +static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll"; +#endif + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -173,6 +181,8 @@ void* GetCublasDsoHandle() { void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); #endif @@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() { void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 01ee67fd07..0668053950 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -140,68 +140,72 @@ struct EOFException : public std::exception { #define LIKELY(condition) (condition) #endif +inline bool is_error(bool stat) { return !stat; } + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { - if (UNLIKELY(!(stat))) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(string::Sprintf(args...)); + throw std::runtime_error(string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } } #ifdef PADDLE_WITH_CUDA +inline bool is_error(cudaError_t e) { return UNLIKELY(e); } + template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... 
args) { - if (UNLIKELY(e)) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { - if (stat != CURAND_STATUS_SUCCESS) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cudnnStatus_t stat) { + return stat != CUDNN_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cudnnStatus_t stat, const Args&... args) { - if (stat == CUDNN_STATUS_SUCCESS) { - return; - } else { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cublasStatus_t stat) { + return stat != CUBLAS_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cublasStatus_t stat, const Args&... args) { std::string err; - if (stat == CUBLAS_STATUS_SUCCESS) { - return; - } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { err = "CUBLAS: not initialized, "; } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { err = "CUBLAS: alloc failed, "; @@ -254,21 +258,49 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) +#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; + +#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \ + ::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG)); + +#define __PADDLE_THROW_ON_ERROR(COND, ...) \ + __PADDLE_THROW_ERROR_I( \ + __VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)) + +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + __PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \ + } \ + } while (0) + #ifndef REPLACE_ENFORCE_GLOG -#define PADDLE_ENFORCE(...) \ +#define __PADDLE_ENFORCE_I(COND, ...) \ do { \ try { \ - ::paddle::platform::throw_on_error(__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ } catch (...) 
{ \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ } \ - } while (false) + } while (0) #else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); +#define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG +#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args +#define PADDLE_ENFORCE(...) __PADDLE_ENFORCE((__VA_ARGS__)) + #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index d521829655..1091badae5 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -37,6 +37,25 @@ TEST(ENFORCE, FAILED) { HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all")); } EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false, "Enforce is not ok at all"); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "Enforce is not ok at all")); + } + EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_NE(std::string(error.what()).find(" at "), 0); + } + EXPECT_TRUE(caught_exception); } TEST(ENFORCE, NO_ARG_OK) { diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index 27e930e6e0..3a937dfaec 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/init.h" diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index e2b7ca9b03..b1b51d804e 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -11,6 +11,7 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include #include diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 584df85e80..b3d20736a8 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -145,7 +145,8 @@ class MKLDNNHandler { const std::shared_ptr user_memory_p, const std::string& suffix, std::vector& pipeline, // NOLINT - bool is_persistent = false) { + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, int mask = 0) { // create reorder primitive if the input format is not the preferred one auto local_key = key_ + suffix; auto key_reorder_p = key_ + suffix + "reorder_p"; @@ -159,8 +160,20 @@ class MKLDNNHandler { std::shared_ptr reorder_p; if (mpd != user_mpd) { target_memory_p = std::make_shared(mpd); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); + std::shared_ptr reorder_p; + if (is_INT8) { + mkldnn::primitive_attr + attri; // attribute for int8 weights and bias data reorder. 
+ attri.set_output_scales(mask, scale_data); + + auto reorder_pd = std::shared_ptr( + new mkldnn::reorder::primitive_desc(user_mpd, mpd, attri)); + reorder_p = std::shared_ptr(new mkldnn::reorder( + *reorder_pd, *user_memory_p, *target_memory_p)); + } else { + reorder_p = std::make_shared(*user_memory_p, + *target_memory_p); + } dev_ctx_.SetBlob(key_reorder_p, reorder_p); pipeline.push_back(*reorder_p); } @@ -182,22 +195,58 @@ class MKLDNNHandler { return dims2str(operand_dims) + suffix; } - template + template static void SetDstMemory( const framework::ExecutionContext& ctx, framework::Tensor* output, std::vector dst_tz, const mkldnn::engine& engine, std::shared_ptr& dst_pd, // NOLINT std::shared_ptr& dst_memory) { // NOLINT - M* output_data = output->mutable_data(ctx.GetPlace()); + T* output_data = output->mutable_data(ctx.GetPlace()); auto dst_md = platform::MKLDNNMemDesc( {dst_tz}, paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType), + framework::DataTypeTrait::DataType), mkldnn::memory::format::nhwc); dst_pd.reset(new mkldnn::memory::primitive_desc(dst_md, engine)); - dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); + dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); + } + + static void AppendKey( + std::string* key, const mkldnn::memory::dims& input_dims, + const mkldnn::memory::dims& weights_dims, const std::vector& strides, + const std::vector& paddings, const std::vector& dilations, + const int& groups, const mkldnn::memory::data_type& srcdt, + const mkldnn::memory::format& format, + const mkldnn::memory::data_type& dstdt, const std::string& suffix) { + AppendKeyDims(key, input_dims); + AppendKeyDims(key, weights_dims); + AppendKeyVec(key, strides); + AppendKeyVec(key, paddings); + AppendKeyVec(key, dilations); + AppendKey(key, std::to_string(groups)); + AppendKey(key, std::to_string(srcdt)); + AppendKey(key, std::to_string(format)); + AppendKey(key, std::to_string(dstdt)); + AppendKey(key, suffix); } protected: + static void AppendKeyDims(std::string* key, + const mkldnn::memory::dims& dims) { + for (unsigned int i = 0; i < dims.size(); i++) { + AppendKey(key, std::to_string(dims[i])); + } + } + + static void AppendKeyVec(std::string* key, const std::vector& dims) { + for (unsigned int i = 0; i < dims.size(); i++) { + AppendKey(key, std::to_string(dims[i])); + } + } + + static void AppendKey(std::string* key, const std::string& s) { + key->append(s); + } + static std::string dims2str(const mkldnn::memory::dims& operand_dims) { std::string dstr = ""; for (size_t i = 0; i < operand_dims.size(); ++i) { @@ -215,7 +264,8 @@ class MKLDNNHandler { class TransposeMKLDNNHandler : public MKLDNNHandler { public: - TransposeMKLDNNHandler(std::vector& dims, std::vector& axis, + TransposeMKLDNNHandler(std::vector& dims, // NOLINT + std::vector& axis, // NOLINT const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), @@ -303,8 +353,9 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { } protected: - mkldnn_memory_desc_t Axis2MemoryDesc(std::vector& nchw_tz, - std::vector& axis) { + mkldnn_memory_desc_t Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT + std::vector& axis // NOLINT + ) { mkldnn_memory_desc_t mem_fmt; mem_fmt.primitive_kind = mkldnn_memory; @@ -462,21 +513,26 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, 
std::vector& pipeline, // NOLINT - bool is_persistent = false) { + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, int mask = 0) { auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); - return this->AcquireMemory(weights_pd, user_weights_pd, - user_weights_memory_p, "@weights_mem_p", - pipeline, is_persistent); + return this->AcquireMemory( + weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", + pipeline, is_persistent, is_INT8, scale_data, mask); } std::shared_ptr AcquireBiasMemoryFromPrimitive( const std::shared_ptr user_bias_memory_p, - std::vector& pipeline) { // NOLINT + std::vector& pipeline, // NOLINT + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, + int mask = 0) { // NOLINT auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); auto bias_pd = conv_pd_->bias_primitive_desc(); return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, - "@bias_mem_p", pipeline); + "@bias_mem_p", pipeline, is_persistent, is_INT8, + scale_data, mask); } std::shared_ptr AcquireConvolution( @@ -594,5 +650,29 @@ using ConvTransposeMKLDNNHandler = ConvMKLDNNTemplateHandler; + +template +static std::shared_ptr SetDstMemory( + const framework::ExecutionContext& ctx, framework::Tensor* output, + const std::shared_ptr& handler) { + T* output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler->GetDstMemorySize()); + std::shared_ptr dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + return dst_memory_p; +} + +template +static std::shared_ptr SetDstMemoryHandler( + const framework::ExecutionContext& ctx, framework::Tensor* output, + const std::shared_ptr& handler) { + T* output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler->GetDstMemorySize()); + std::shared_ptr dst_memory_p; + dst_memory_p->set_data_handle(to_void_cast(output_data)); + return dst_memory_p; +} } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 6ce4bf8f13..8df8e32098 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -106,7 +106,7 @@ struct NCCLContextMap { } std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. - if (num_trainers == 1) { + if (num_trainers == 1 && nccl_id == nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 998242fb4a..85977366e6 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/port.h" - #include #include #include @@ -25,9 +22,12 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA + #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/platform/device_tracer.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); @@ -173,8 +173,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) : is_enabled_(false), start_ns_(PosixInNsec()) { - std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; + std::lock_guard l(profiler_mu); + is_enabled_ = true; dev_ctx_ = dev_ctx; name_ = name; @@ -184,8 +185,8 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) } RecordEvent::~RecordEvent() { - std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; + std::lock_guard l(profiler_mu); DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc index e4e5be5b89..35d1d92981 100644 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -14,12 +14,27 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include +#include +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor_util.h" + DECLARE_double(limit_of_temporary_allocation); namespace paddle { namespace platform { +class DummyOp : public framework::OperatorBase { + public: + DummyOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override {} +}; + TEST(temporary_allocator, temporary_allocator) { platform::CPUPlace cpu_place; TemporaryAllocator alloc(cpu_place); @@ -68,96 +83,92 @@ TEST(temporary_allocator, add_callback) { } TEST(temporary_allocator, create_tensor_with_allocationptr) { - platform::CPUPlace cpu_place; - TemporaryAllocator cpu_alloc(cpu_place); + framework::VariableNameMap dummy_vars; + framework::AttributeMap dummy_attrs; + DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs); + framework::Scope scope; + framework::VariableValueMap vars; + framework::RuntimeContext run_ctx(vars, vars); + size_t memory_size = 300; { - size_t memory_size = 200; - auto allocation = cpu_alloc.Allocate(memory_size); - void* address = allocation->ptr(); + platform::CPUPlace cpu_place; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(cpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + int numel = memory_size / sizeof(float); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } #ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); - { - size_t memory_size = 300; - auto allocation = 
gpu_alloc.Allocate(memory_size); - void* address = allocation->ptr(); + platform::CUDAPlace gpu_place(0); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(gpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); int numel = memory_size / sizeof(float); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } - - // The allocation is not holded now, it should be placed to - // TemporaryAllocationQueue. - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); #endif } TEST(temporary_allocator, create_tensor_with_allocationptr2) { - platform::CPUPlace cpu_place; - TemporaryAllocator cpu_alloc(cpu_place); + framework::VariableNameMap dummy_vars; + framework::AttributeMap dummy_attrs; + DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs); + framework::Scope scope; + framework::VariableValueMap vars; + framework::RuntimeContext run_ctx(vars, vars); + size_t memory_size = 400; { - size_t memory_size = 400; + platform::CPUPlace cpu_place; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(cpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); int numel = memory_size / sizeof(float); framework::Tensor out_side_tensor; - void* address; { - auto allocation = cpu_alloc.Allocate(memory_size); - address = allocation->ptr(); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); out_side_tensor.ShareDataWith(tensor); } - PADDLE_ENFORCE_EQ(address, out_side_tensor.data()); PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); } #ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); { - void* address; + platform::CUDAPlace gpu_place(0); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(gpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + size_t memory_size = 500; int numel = memory_size / sizeof(float); framework::Tensor out_side_tensor; { - auto allocation = gpu_alloc.Allocate(memory_size); - address = allocation->ptr(); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); out_side_tensor.ShareDataWith(tensor); } - PADDLE_ENFORCE_EQ(address, out_side_tensor.data()); PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); - // The allocation is holded by out_side_tensor. - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); } - - // The allocation is not holded now, it should be placed to - // TemporaryAllocationQueue. 
- PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); #endif } diff --git a/paddle/fluid/platform/timer.cc b/paddle/fluid/platform/timer.cc new file mode 100644 index 0000000000..75d4e5cbf9 --- /dev/null +++ b/paddle/fluid/platform/timer.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/timer.h" + +namespace paddle { +namespace platform { + +void Timer::Reset() { + _start.tv_sec = 0; + _start.tv_usec = 0; + + _count = 0; + _elapsed = 0; + _paused = true; +} + +void Timer::Start() { + Reset(); + Resume(); +} + +void Timer::Pause() { + if (_paused) { + return; + } + _elapsed += Tickus(); + ++_count; + _paused = true; +} + +void Timer::Resume() { + gettimeofday(&_start, NULL); + _paused = false; +} + +int Timer::Count() { return _count; } + +double Timer::ElapsedUS() { return static_cast(_elapsed); } + +double Timer::ElapsedMS() { return _elapsed / 1000.0; } + +double Timer::ElapsedSec() { return _elapsed / 1000000.0; } + +int64_t Timer::Tickus() { + gettimeofday(&_now, NULL); + return (_now.tv_sec - _start.tv_sec) * 1000 * 1000L + + (_now.tv_usec - _start.tv_usec); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h new file mode 100644 index 0000000000..56019ae7cf --- /dev/null +++ b/paddle/fluid/platform/timer.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/platform/port.h" + +#ifdef _WIN32 +static unsigned sleep(unsigned seconds) { + Sleep(seconds * 1000); + return 0; +} +#endif + +namespace paddle { +namespace platform { + +// A Standard Timer implementation for debugging +class Timer { + public: + // a timer class for profiling + // Reset() will be called during initialization + // all timing variables will be set 0 in Reset() + Timer() { Reset(); } + void Reset(); + void Start(); + void Pause(); + // Resume will get current system time + void Resume(); + int Count(); + // return elapsed time in us + double ElapsedUS(); + // return elapsed time in ms + double ElapsedMS(); + // return elapsed time in sec + double ElapsedSec(); + + private: + struct timeval _start; + struct timeval _now; + int _count; + int _elapsed; + bool _paused; + + // get us difference between start and now + int64_t Tickus(); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/timer_test.cc b/paddle/fluid/platform/timer_test.cc new file mode 100644 index 0000000000..09edf8131f --- /dev/null +++ b/paddle/fluid/platform/timer_test.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/fluid/platform/timer.h" +#include "gtest/gtest.h" + +TEST(Timer, Reset) { + paddle::platform::Timer timeline; + timeline.Start(); + sleep(3); + timeline.Pause(); + timeline.Reset(); +} + +TEST(Timer, Start) { + paddle::platform::Timer timeline; + timeline.Start(); + sleep(3); + timeline.Pause(); +} + +TEST(Timer, Pause) { + paddle::platform::Timer timeline; + timeline.Start(); + sleep(3); + timeline.Pause(); +} + +TEST(Timer, Resume) { + paddle::platform::Timer timeline; + timeline.Start(); + sleep(3); + timeline.Pause(); + timeline.Resume(); +} diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index fb8bcb190b..72b0f216d3 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,5 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index f8ded9f94e..06d8b65fb1 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -49,9 +49,6 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpNameScopeAttrName", framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); - op_proto_and_checker_maker.def( - "kOpCreationCallstackAttrName", - framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); } } // namespace pybind diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index be63fb8778..5c1c7478f4 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/tracer.h" namespace paddle { @@ -24,13 +23,10 @@ namespace pybind { void BindTracer(pybind11::module *m) { pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block, - framework::BlockDesc *startup_block) { - new (&self) imperative::Tracer(root_block, startup_block); + [](imperative::Tracer &self, framework::BlockDesc *root_block) { + new (&self) imperative::Tracer(root_block); }) - .def("trace", &imperative::Tracer::Trace) - .def("get_scope", &imperative::Tracer::GetScope, - pybind11::return_value_policy::reference); + .def("trace", &imperative::Tracer::Trace); } } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 88a2a5276a..dce755c91a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -32,6 +32,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope_pool.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/imperative/layer.h" @@ -83,11 +84,15 @@ bool IsCompiledWithCUDA() { } bool IsCompiledWithBrpc() { -#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA) - return true; -#else +#ifndef PADDLE_WITH_DISTRIBUTE + return false; +#endif + +#ifdef PADDLE_WITH_GRPC return false; #endif + + return true; } bool IsCompiledWithDIST() { @@ -117,20 +122,42 @@ PYBIND11_MODULE(core, m) { return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); }); - py::class_(m, "VarBase", R"DOC()DOC") - .def(py::init<>()) + m.add_object("_cleanup", + py::capsule([]() { ScopePool::Instance().Clear(); })); + + py::class_>( + m, "VarBase", R"DOC()DOC") + // .def(py::init<>()) + .def(py::init(), py::arg("stop_gradient") = false) .def("_run_backward", - [](imperative::VarBase &self, framework::Scope *scope) { - self.RunBackward(scope); - }) + [](imperative::VarBase &self) { self.RunBackward(); }) + .def("_grad_name", &imperative::VarBase::GradName) .def("_grad", &imperative::VarBase::Grad) + .def_property("grad_value", + [](const imperative::VarBase &self) { return self.grads_; }, + [](imperative::VarBase &self, framework::Variable *grad) { + self.grads_ = grad; + }, + py::return_value_policy::reference) + .def_property("value", + [](const imperative::VarBase &self) { return self.var_; }, + [](imperative::VarBase &self, framework::Variable *var) { + self.var_ = var; + }, + py::return_value_policy::reference) .def_property( "desc", [](const imperative::VarBase &self) { return self.var_desc_; }, [](imperative::VarBase &self, framework::VarDesc *var_desc) { self.var_desc_ = var_desc; }, - py::return_value_policy::reference); + py::return_value_policy::reference) + .def_property( + "stop_gradient", + [](const imperative::VarBase &self) { return self.stop_gradient_; }, + [](imperative::VarBase &self, bool stop_gradient) { + self.stop_gradient_ = stop_gradient; + }); py::class_(m, "OpBase", R"DOC()DOC") .def(py::init<>()) @@ -454,7 +481,7 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::copy); - py::class_(m, "Scope", R"DOC( + py::class_(m, "_Scope", R"DOC( Scope is an association of a name to Variable. All variables belong to Scope. Variables in a parent scope can be retrieved from local scope. @@ -474,17 +501,26 @@ All parameter, weight, gradient are variables in Paddle. param.set(param_array, place) )DOC") + .def("_remove_from_pool", + [](Scope &self) { ScopePool::Instance().Remove(&self); }) .def("var", [](Scope &self, const std::string &name) -> Variable * { return self.Var(name); }, py::return_value_policy::reference) .def("find_var", &Scope::FindVar, py::return_value_policy::reference) - .def(py::init<>()) .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, py::return_value_policy::reference) .def("drop_kids", &Scope::DropKids); + m.def("Scope", + []() -> Scope * { + auto *s = new Scope(); + ScopePool::Instance().Insert(std::unique_ptr(s)); + return s; + }, + py::return_value_policy::reference); + //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. 
m.def("get_all_op_protos", []() -> std::vector { @@ -910,13 +946,6 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is STR, debug_graphviz_path indicate the path that writing the SSA Graph to file in the form of graphviz, you. It is useful for debugging. Default "")DOC") - .def_property( - "enable_data_balance", - [](const BuildStrategy &self) { return self.enable_data_balance_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); - self.enable_data_balance_ = b; - }) // FIXME(chengudo): enable_data_balance seems not important .def_property( "enable_sequential_execution", [](const BuildStrategy &self) { @@ -971,6 +1000,10 @@ All parameter, weight, gradient are variables in Paddle. "memory_optimize", [](const BuildStrategy &self) { return self.memory_optimize_; }, [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }) + .def_property( + "is_distribution", + [](const BuildStrategy &self) { return self.is_distribution_; }, + [](BuildStrategy &self, bool b) { self.is_distribution_ = b; }) .def_property( "memory_early_delete", [](const BuildStrategy &self) { return self.memory_early_delete_; }, diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index a2eec6e3c4..0b94b60018 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -87,7 +87,7 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { template std::string Sprintf(const Args&... args) { std::ostringstream oss; - Fprintf(oss, ""); + Fprintf(oss, "%s", args...); return oss.str(); } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 418dc13468..50b7a63129 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- #================================================= # Utils #================================================= @@ -200,6 +199,7 @@ function cmake_gen() { -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON} -DPY_VERSION=${PY_VERSION:-2.7} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} + -DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -233,7 +233,8 @@ EOF -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\ -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\ -DPY_VERSION=${PY_VERSION:-2.7} \ - -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \ + -DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF} } @@ -418,13 +419,6 @@ EOF else ctest --output-on-failure fi - - # make install should also be test when unittest - make install -j `nproc` - pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl - if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then - paddle version - fi fi } @@ -455,7 +449,7 @@ EOF elif [ "$1" == "cp37-cp37m" ]; then pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi - + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi @@ -535,6 +529,18 @@ function assert_api_spec_approvals() { fi fi + pip install ${PADDLE_ROOT}/build/opt/paddle/share/wheels/*.whl + CHECK_DOCK_MD5=`python ${PADDLE_ROOT}/tools/check_doc_approval.py` + if [ "True" != ${CHECK_DOCK_MD5} ]; then + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308` + echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" + if [ "${APPROVALS}" == "FALSE" ]; then + echo "You must have shanyi15 approval for the api doc change! 
" + exit 1 + fi + echo ${CHECK_DOCK_MD5} >/root/.cache/doc_md5.txt + fi } @@ -922,6 +928,7 @@ function main() { ;; assert_api) assert_api_not_changed ${PYTHON_ABI:-""} + assert_api_spec_approvals ;; test_inference) gen_capi_package @@ -946,6 +953,15 @@ function main() { run_test assert_api_not_changed ${PYTHON_ABI:-""} ;; + cmake_gen) + cmake_gen ${PYTHON_ABI:-""} + ;; + gen_fluid_lib) + gen_fluid_lib + ;; + test_fluid_lib) + test_fluid_lib + ;; *) print_usage exit 0 diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index ef43d13e18..47c5248b57 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -28,20 +28,53 @@ int main(int argc, char** argv) { for (int i = 0; i < argc; ++i) { new_argv.push_back(argv[i]); } + + std::vector envs; + std::vector undefok; +#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC) + envs.push_back("max_body_size"); +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - new_argv.push_back( - strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy")); + envs.push_back("fraction_of_gpu_memory_to_use"); + envs.push_back("allocator_strategy"); #elif __clang__ - new_argv.push_back( - strdup("--tryfromenv=use_mkldnn,initial_cpu_memory_in_" - "mb,allocator_strategy")); - new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb")); + envs.push_back("use_mkldnn"); + envs.push_back("initial_cpu_memory_in_mb"); + envs.push_back("allocator_strategy"); + + undefok.push_back("use_mkldnn"); + undefok.push_back("initial_cpu_memory_in_mb"); #else - new_argv.push_back( - strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_" - "mb,allocator_strategy")); - new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb")); + envs.push_back("use_pinned_memory"); + envs.push_back("use_mkldnn"); + envs.push_back("initial_cpu_memory_in_mb"); + envs.push_back("allocator_strategy"); + + undefok.push_back("use_mkldnn"); + undefok.push_back("initial_cpu_memory_in_mb"); #endif + + if (envs.size() > 0) { + std::string env_string = "--tryfromenv="; + for (auto t : envs) { + env_string += t + ","; + } + env_string = env_string.substr(0, env_string.length() - 1); + new_argv.push_back(strdup(env_string.c_str())); + VLOG(1) << "gtest env_string:" << env_string; + } + + if (undefok.size() > 0) { + std::string undefok_string = "--undefok="; + for (auto t : undefok) { + undefok_string += t + ","; + } + undefok_string = undefok_string.substr(0, undefok_string.length() - 1); + new_argv.push_back(strdup(undefok_string.c_str())); + VLOG(1) << "gtest undefok_string:" << undefok_string; + } + int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8f3660ca38..f9f3807b15 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -46,7 +46,7 @@ from . import transpiler from . 
import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder -from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope +from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig from .lod_tensor import create_lod_tensor, create_random_int_lodtensor @@ -102,13 +102,6 @@ def __bootstrap__(): import sys import os import platform - - if os.name == 'nt': - third_lib_path = os.path.abspath(os.path.dirname( - __file__)) + os.sep + '..' + os.sep + 'libs' - os.environ['path'] += ';' + third_lib_path - sys.path.append(third_lib_path) - from . import core in_test = 'unittest' in sys.modules @@ -135,7 +128,8 @@ def __bootstrap__(): 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir' + 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', + 'enable_parallel_graph' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') @@ -151,12 +145,17 @@ def __bootstrap__(): read_env_flags.append('rpc_get_thread_num') read_env_flags.append('rpc_prefetch_thread_num') read_env_flags.append('rpc_disable_reuse_port') + if core.is_compiled_with_brpc(): + read_env_flags.append('max_body_size') + #set brpc max body size + os.environ['FLAGS_max_body_size'] = "2147483647" if core.is_compiled_with_cuda(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus' + 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', + 'cudnn_exhaustive_search_times', 'sync_nccl_allreduce' ] core.init_gflags([sys.argv[0]] + diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index af02721eb7..c280ff21ee 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -272,8 +272,7 @@ class DataFeeder(object): dict: the result of conversion. Raises: - ValueError: If drop_last is False and the data batch which cannot - fit for devices. + ValueError: If drop_last is False and the data batch which cannot fit for devices. """ def __reader_creator__(): diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index f2886090d7..5a9e908b61 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -191,7 +191,7 @@ def _fetch_var(name, scope=None, return_numpy=True): assert isinstance(name, str) if scope is None: scope = global_scope() - assert isinstance(scope, core.Scope) + assert isinstance(scope, core._Scope) var = scope.find_var(name) assert var is not None, ( diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3427fb0c4a..70767c962f 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -15,18 +15,24 @@ from __future__ import print_function import collections +from collections import defaultdict import contextlib import os import re import six -import sys -import traceback import numpy as np from .. import compat as cpt from .proto import framework_pb2 try: + if os.name == 'nt': + import sys + third_lib_path = os.path.abspath(os.path.dirname( + __file__)) + os.sep + '..' 
+ os.sep + 'libs' + os.environ['path'] += ';' + third_lib_path + sys.path.append(third_lib_path) + from . import core except ImportError as e: if os.name == 'nt': @@ -368,19 +374,26 @@ class Variable(object): if _in_imperative_mode(): self._ivar = core.VarBase() self._ivar.desc = self.desc + self._ivar.stop_gradient = stop_gradient def _numpy(self): - scope = _imperative_tracer().get_scope(self.block.desc) - tensor = core.get_variable_tensor(scope, self.desc.name()) + tensor = self._ivar.value.get_tensor() return np.array(tensor) def _backward(self): - scope = _imperative_tracer().get_scope(self.block.desc) - self._ivar._run_backward(scope) + self._ivar._run_backward() def _gradient(self): return np.array(self._ivar._grad()) + @property + def _value(self): + return self._ivar.value + + @_value.setter + def _value(self, v): + self._ivar.value = v + def __str__(self): return self.to_string(True) @@ -424,6 +437,14 @@ class Variable(object): """ self.desc = input + @property + def _stop_gradient(self): + return self._ivar.stop_gradient + + @_stop_gradient.setter + def _stop_gradient(self, s): + self._ivar.stop_gradient = s + @property def persistable(self): return self.desc.persistable() @@ -605,10 +626,6 @@ class Operator(object): if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: del op_attrs[role_var_name] - callstack_var_name = op_maker.kOpCreationCallstackAttrName() - op_attrs[callstack_var_name] = list( - reversed(traceback.format_stack()))[1:] - if len(self.desc.type()) != 0: return if type is None: @@ -653,20 +670,16 @@ class Operator(object): self.desc.set_input(in_proto.name, []) if outputs is not None: - given = set() - need = set() - for n in outputs: - given.add(n) for m in proto.outputs: - need.add(m.name) - if not given == need: - raise ValueError(("Incorrect setting for output(s) of " - "operator \"%s\". 
Need: [%s] Given: [%s]") % - (type, - ", ".join(six.binary_type(e) for e in need), - ", ".join(six.binary_type(e) for e in given))) - + if (m.name not in outputs) and m.dispensable: + continue + if not ((m.name in outputs) or m.dispensable): + raise ValueError( + ("Incorrect setting for output(s) of " + "operator \"%s\", should set: [%s].") % (type, m.name)) for out_proto in proto.outputs: + if out_proto.name not in outputs: + continue out_args = outputs[out_proto.name] if not isinstance(out_args, list): out_args = [out_args] @@ -691,26 +704,28 @@ class Operator(object): self._update_desc_attr(attr_name, attr_val) self.desc.check_attrs() + if self._has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) + if _in_imperative_mode(): self.iop = core.OpBase() self.iop.desc = self.desc - self.inputs = [] + self.inputs = defaultdict(list) if inputs is not None: - for inp in inputs.values(): - if isinstance(inp, Variable): - self.inputs.append(inp) - elif isinstance(inp, list) or isinstance(inp, tuple): - self.inputs.extend(inp[:]) - self.outputs = [] + for k, v in six.iteritems(inputs): + if isinstance(v, Variable): + self.inputs[k].append(v._ivar) + elif isinstance(v, list) or isinstance(v, tuple): + self.inputs[k].extend([var._ivar for var in v]) + self.outputs = defaultdict(list) if outputs is not None: - for out in outputs.values(): - if isinstance(out, Variable): - self.outputs.append(out) - elif isinstance(out, list) or isinstance(out, tuple): - self.outputs.extend(out[:]) + for k, v in six.iteritems(outputs): + if isinstance(v, Variable): + self.outputs[k].append(v._ivar) + elif isinstance(v, list) or isinstance(v, tuple): + self.outputs[k].extend([var._ivar for var in v]) def _has_kernel(self, op_type): return op_type not in self.OP_WITHOUT_KERNEL_SET @@ -1276,13 +1291,22 @@ class Block(object): Operator: the append Operator. """ op_desc = self.desc.append_op() - op = Operator(block=self, desc=op_desc, *args, **kwargs) - if _in_imperative_mode(): - _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc) + op = Operator( + block=self, + desc=op_desc, + type=kwargs.get("type", None), + inputs=kwargs.get("inputs", None), + outputs=kwargs.get("outputs", None), + attrs=kwargs.get("attrs", None)) self.ops.append(op) + self._trace_op(op, kwargs.get("stop_gradient", False)) return op + def _trace_op(self, op, stop_gradient=False): + if _in_imperative_mode(): + _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc, + stop_gradient) + def _insert_op(self, index, *args, **kwargs): """ Insert a Operator according to the giving arguments. @@ -1328,11 +1352,15 @@ class Block(object): def _prepend_op(self, *args, **kwargs): op_desc = self.desc._prepend_op() - op = Operator(self, op_desc, *args, **kwargs) - if _in_imperative_mode(): - _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc) + op = Operator( + self, + op_desc, + type=kwargs.get("type", None), + inputs=kwargs.get("inputs", None), + outputs=kwargs.get("outputs", None), + attrs=kwargs.get("attrs", None)) self.ops.insert(0, op) + self._trace_op(op, kwargs.get("stop_gradient", False)) return op def _sync_with_cpp(self): @@ -1646,8 +1674,8 @@ class Program(object): parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need to print. - Returns - (str): The debug string. + Returns: + str : The debug string. 
Raises: ValueError: If any of required fields is not set and throw_on_error is diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py index 922308b6b1..54dc794ea6 100644 --- a/python/paddle/fluid/imperative/__init__.py +++ b/python/paddle/fluid/imperative/__init__.py @@ -20,6 +20,10 @@ from .base import * from . import layers from .layers import * +from . import nn +from .nn import * + __all__ = [] __all__ += layers.__all__ __all__ += base.__all__ +__all__ += nn.__all__ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index aa48ef71aa..c04dcc7e39 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -28,8 +28,7 @@ def enabled(): def guard(): train = framework.Program() startup = framework.Program() - tracer = core.Tracer(train.current_block().desc, - startup.current_block().desc) + tracer = core.Tracer(train.current_block().desc) with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._imperative_guard(tracer): @@ -46,8 +45,7 @@ def to_variable(value, block=None): name=None, shape=value.shape, dtype=value.dtype) - scope = framework._imperative_tracer().get_scope(block.desc) - var = scope.var(py_var.name) + var = py_var._ivar.value tensor = var.get_tensor() tensor.set(value, core.CPUPlace()) return py_var diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py index 044717c319..d78d61eb3f 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/imperative/layers.py @@ -24,26 +24,21 @@ __all__ = ['PyLayer'] class PyLayer(core.Layer): - def __init__(self): - self._built = False - - def __call__(self, inputs): - if not isinstance(inputs, list) and not isinstance(inputs, tuple): - inputs = [inputs] - - var_inputs = [] - for x in inputs: - py_var = base.to_variable(x) - var_inputs.append(py_var) - if not self._built: - self._build_once(inputs) - self._built = True - - outputs = self.forward(var_inputs) - return outputs + def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None): + self._once_built = False + self._dtype = dtype def _build_once(self, inputs): pass - def forward(self, inputs): - return [] + def __call__(self, *inputs): + if not self._once_built: + self._build_once(*inputs) + self._once_built = True + + outputs = self.forward(*inputs) + + return outputs + + def forward(self, *inputs): + raise NotImplementedError diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py new file mode 100644 index 0000000000..4f30417e99 --- /dev/null +++ b/python/paddle/fluid/imperative/nn.py @@ -0,0 +1,250 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from six.moves import reduce + +from .. import core +from ..layers import utils +from . 
import layers +from ..framework import Variable, OpProtoHolder +from ..param_attr import ParamAttr +from ..initializer import Normal, Constant + +__all__ = [ + 'Conv2D', + 'Pool2D', + 'FC', +] + + +class Conv2D(layers.PyLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + use_cudnn=True, + act=None, + param_attr=None, + bias_attr=None, + name=None, + dtype=core.VarDesc.VarType.FP32): + assert param_attr is not False, "param_attr should not be False here." + super(Conv2D, self).__init__(name=name, dtype=dtype) + + from ..layer_helper import LayerHelper + self._helper = LayerHelper( + type(self).__name__, + param_attr=param_attr, + bias_attr=bias_attr, + dtype=dtype, + name=name) + + self._groups = groups + self._stride = utils.convert_to_list(stride, 2, 'stride') + self._padding = utils.convert_to_list(padding, 2, 'padding') + self._dilation = utils.convert_to_list(dilation, 2, 'dilation') + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + self._use_cudnn = use_cudnn + self._num_channels = num_channels + if (self._num_channels == self._groups and + num_filters % self._num_channels == 0 and not self._use_cudnn): + self._l_type = 'depthwise_conv2d' + else: + self._l_type = 'conv2d' + + if groups is None: + num_filter_channels = num_channels + else: + if num_channels % groups != 0: + raise ValueError("num_channels must be divisible by groups.") + num_filter_channels = num_channels // groups + filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') + filter_shape = [num_filters, int(num_filter_channels)] + filter_size + + def _get_default_param_initializer(): + filter_elem_num = filter_size[0] * filter_size[1] * num_channels + std = (2.0 / filter_elem_num)**0.5 + return Normal(0.0, std, 0) + + self._filter_param = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=filter_shape, + dtype=self._dtype, + default_initializer=_get_default_param_initializer()) + + if self._use_cudnn: + self._helper.create_variable( + name="kCUDNNFwdAlgoCache", + persistable=True, + type=core.VarDesc.VarType.RAW) + self._helper.create_variable( + name="kCUDNNBwdDataAlgoCache", + persistable=True, + type=core.VarDesc.VarType.RAW) + self._helper.create_variable( + name="kCUDNNBwdFilterAlgoCache", + persistable=True, + type=core.VarDesc.VarType.RAW) + + self._bias_param = self._helper.create_parameter( + attr=self._helper.bias_attr, + shape=[num_filters], + dtype=self._dtype, + is_bias=True) + + def forward(self, input): + pre_bias = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type=self._l_type, + inputs={ + 'Input': input, + 'Filter': self._filter_param, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': self._stride, + 'paddings': self._padding, + 'dilations': self._dilation, + 'groups': self._groups, + 'use_cudnn': self._use_cudnn, + 'use_mkldnn': False, + }) + + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + + return self._helper.append_activation(pre_act) + + +class Pool2D(layers.PyLayer): + def __init__(self, + pool_size=-1, + pool_type="max", + pool_stride=1, + pool_padding=0, + global_pooling=False, + use_cudnn=True, + ceil_mode=False, + exclusive=True, + name=None, + dtype=core.VarDesc.VarType.FP32): + if 
pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", + str(pool_type)) + + if global_pooling is False and pool_size == -1: + raise ValueError( + "When the global_pooling is False, pool_size must be passed " + "and be a valid value. Received pool_size: " + str(pool_size)) + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + + super(Pool2D, self).__init__(name=name, dtype=dtype) + + from ..layer_helper import LayerHelper + self._helper = LayerHelper(type(self).__name__, dtype=dtype, name=name) + + self._pool_type = pool_type + self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') + self._pool_padding = utils.convert_to_list(pool_padding, 2, + 'pool_padding') + self._pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride') + self._global_pooling = global_pooling + self._use_cudnn = use_cudnn + self._ceil_mode = ceil_mode + self._exclusive = exclusive + self._l_type = 'pool2d' + + def forward(self, input): + pool_out = self._helper.create_variable_for_type_inference(self._dtype) + + self._helper.append_op( + type=self._l_type, + inputs={"X": input}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": self._pool_type, + "ksize": self._pool_size, + "global_pooling": self._global_pooling, + "strides": self._pool_stride, + "paddings": self._pool_padding, + "use_cudnn": self._use_cudnn, + "ceil_mode": self._ceil_mode, + "use_mkldnn": False, + "exclusive": self._exclusive, + }) + return pool_out + + +class FC(layers.PyLayer): + def __init__(self, + size, + param_attr=None, + num_flatten_dims=1, + dtype=core.VarDesc.VarType.FP32): + super(FC, self).__init__() + self._size = size + self._num_flatten_dims = num_flatten_dims + self._dtype = dtype + from ..layer_helper import LayerHelper + self._helper = LayerHelper('FC', param_attr=param_attr) + + def _build_once(self, input): + input_shape = input.shape + param_shape = [ + reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) + ] + [self._size] + self._w = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=False) + + def forward(self, input): + tmp = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="mul", + inputs={"X": input, + "Y": self._w}, + outputs={"Out": tmp}, + attrs={ + "x_num_col_dims": self._num_flatten_dims, + "y_num_col_dims": 1 + }) + + out = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="sum", + inputs={"X": [tmp]}, + outputs={"Out": out}, + attrs={"use_mkldnn": False}) + return out diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 26d1f8f4d2..8a2cd4a929 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -162,7 +162,8 @@ class ConstantInitializer(Initializer): "dtype": int(var.dtype), "value": float(self._value), 'force_cpu': self._force_cpu or force_init_on_cpu() - }) + }, + stop_gradient=True) var.op = op return op @@ -231,7 +232,8 @@ class UniformInitializer(Initializer): "min": self._low, "max": self._high, "seed": self._seed - }) + }, + stop_gradient=True) if var.dtype == VarDesc.VarType.FP16: block.append_op( @@ -309,7 +311,8 @@ class NormalInitializer(Initializer): "std": self._std_dev, "seed": self._seed, "use_mkldnn": False - }) + }, + stop_gradient=True) if var.dtype == VarDesc.VarType.FP16: block.append_op( @@ -371,7 +374,8 @@ class 
TruncatedNormalInitializer(Initializer): "mean": self._mean, "std": self._std_dev, "seed": self._seed - }) + }, + stop_gradient=True) var.op = op return op @@ -461,7 +465,8 @@ class XavierInitializer(Initializer): "min": -limit, "max": limit, "seed": self._seed - }) + }, + stop_gradient=True) else: std = np.sqrt(2.0 / float(fan_in + fan_out)) @@ -474,7 +479,8 @@ class XavierInitializer(Initializer): "mean": 0.0, "std": std, "seed": self._seed - }) + }, + stop_gradient=True) var.op = op return op @@ -559,7 +565,8 @@ class MSRAInitializer(Initializer): "min": -limit, "max": limit, "seed": self._seed - }) + }, + stop_gradient=True) else: std = np.sqrt(2.0 / float(fan_in)) @@ -572,7 +579,8 @@ class MSRAInitializer(Initializer): "mean": 0.0, "std": std, "seed": self._seed - }) + }, + stop_gradient=True) var.op = op return op diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 74b4a977db..ea9953f581 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -20,10 +20,10 @@ import six import sys import numpy as np -from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating +from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode from . import unique_name +from paddle.fluid.imperative import base as imperative_base from paddle.fluid.initializer import Constant, Xavier -from paddle.fluid.imperative import base from .param_attr import ParamAttr, WeightNormParamAttr from . import core from six.moves import zip @@ -50,7 +50,7 @@ class LayerHelper(object): return default_startup_program() def to_variable(self, x): - return base.to_variable(x, self.main_program.current_block()) + return imperative_base.to_variable(x, self.main_program.current_block()) def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) @@ -313,11 +313,20 @@ class LayerHelper(object): param = self._create_weight_normalize(attr, shape, dtype) WeightNormParamAttr.params_with_weight_norm.append(param) return param - - self.startup_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True)) - return self.main_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr._to_kwargs()) + if _in_imperative_mode(): + # In imperative mode, we want the returned parameter to be + # initialized so that it can be used imperatively. 
+ return self.main_program.global_block().create_parameter( + dtype=dtype, + shape=shape, + **attr._to_kwargs(with_initializer=True)) + else: + self.startup_program.global_block().create_parameter( + dtype=dtype, + shape=shape, + **attr._to_kwargs(with_initializer=True)) + return self.main_program.global_block().create_parameter( + dtype=dtype, shape=shape, **attr._to_kwargs()) def get_parameter(self, name): param = self.main_program.global_block().var(name) @@ -369,13 +378,16 @@ class LayerHelper(object): def set_variable_initializer(self, var, initializer): assert isinstance(var, Variable) - self.startup_program.global_block().create_var( - name=var.name, - type=var.type, - dtype=var.dtype, - shape=var.shape, - persistable=True, - initializer=initializer) + if imperative_base.enabled(): + initializer(var, var.block) + else: + self.startup_program.global_block().create_var( + name=var.name, + type=var.type, + dtype=var.dtype, + shape=var.shape, + persistable=True, + initializer=initializer) def append_bias_op(self, input_var, dim_start=1, dim_end=None): """ diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 9d98e8333b..a7494aacea 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -1452,6 +1452,7 @@ class DynamicRNN(object): def step_input(self, x): """ Mark a sequence as a dynamic RNN input. + Args: x(Variable): The input sequence. @@ -1505,6 +1506,7 @@ class DynamicRNN(object): """ Mark a variable as a RNN input. The input will not be scattered into time steps. + Args: x(Variable): The input variable. @@ -1629,13 +1631,11 @@ class DynamicRNN(object): Args: init(Variable|None): The initialized variable. - shape(list|tuple): The memory shape. NOTE the shape does not contain - batch_size. + shape(list|tuple): The memory shape. NOTE the shape does not contain batch_size. value(float): the initalized value. - need_reorder(bool): True if the initialized memory depends on the - input sample. + need_reorder(bool): True if the initialized memory depends on the input sample. dtype(str|numpy.dtype): The data type of the initialized memory. @@ -1714,6 +1714,7 @@ class DynamicRNN(object): """ Update the memory from ex_mem to new_mem. NOTE that the shape and data type of :code:`ex_mem` and :code:`new_mem` must be same. + Args: ex_mem(Variable): the memory variable. new_mem(Variable): the plain variable generated in RNN block. diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index ce731f39ea..8aed97dc59 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -65,7 +65,7 @@ def rpn_target_assign(bbox_pred, rpn_negative_overlap=0.3, use_random=True): """ - ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. ** + **Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.** This layer can be, for given the Intersection-over-Union (IoU) overlap between anchors and ground truth boxes, to assign classification and @@ -135,19 +135,20 @@ def rpn_target_assign(bbox_pred, Examples: .. 
code-block:: python - bbox_pred = layers.data(name='bbox_pred', shape=[100, 4], - append_batch_size=False, dtype='float32') - cls_logits = layers.data(name='cls_logits', shape=[100, 1], - append_batch_size=False, dtype='float32') - anchor_box = layers.data(name='anchor_box', shape=[20, 4], - append_batch_size=False, dtype='float32') - gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], - append_batch_size=False, dtype='float32') - loc_pred, score_pred, loc_target, score_target, bbox_inside_weight = - fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, - cls_logits=cls_logits, - anchor_box=anchor_box, - gt_boxes=gt_boxes) + bbox_pred = layers.data(name='bbox_pred', shape=[100, 4], + append_batch_size=False, dtype='float32') + cls_logits = layers.data(name='cls_logits', shape=[100, 1], + append_batch_size=False, dtype='float32') + anchor_box = layers.data(name='anchor_box', shape=[20, 4], + append_batch_size=False, dtype='float32') + gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], + append_batch_size=False, dtype='float32') + loc_pred, score_pred, loc_target, score_target, bbox_inside_weight = + fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, + cls_logits=cls_logits, + anchor_box=anchor_box, + gt_boxes=gt_boxes) + """ helper = LayerHelper('rpn_target_assign', **locals()) @@ -1519,27 +1520,30 @@ def anchor_generator(input, Args: input(Variable): The input feature map, the format is NCHW. anchor_sizes(list|tuple|float): The anchor sizes of generated anchors, - given in absolute pixels e.g. [64., 128., 256., 512.]. - For instance, the anchor size of 64 means the area of this anchor equals to 64**2. + given in absolute pixels e.g. [64., 128., 256., 512.]. + For instance, the anchor size of 64 means the area of this anchor equals to 64**2. aspect_ratios(list|tuple|float): The height / width ratios of generated - anchors, e.g. [0.5, 1.0, 2.0]. + anchors, e.g. [0.5, 1.0, 2.0]. variance(list|tuple): The variances to be used in box regression deltas. - Default:[0.1, 0.1, 0.2, 0.2]. - stride(list|turple): The anchors stride across width and height, - e.g. [16.0, 16.0] + Default:[0.1, 0.1, 0.2, 0.2]. + stride(list|turple): The anchors stride across width and height,e.g. [16.0, 16.0] offset(float): Prior boxes center offset. Default: 0.5 name(str): Name of the prior box op. Default: None. Returns: - Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. - H is the height of input, W is the width of input, - num_anchors is the box count of each position. - Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. - Variances(Variable): The expanded variances of anchors - with a layout of [H, W, num_priors, 4]. - H is the height of input, W is the width of input - num_anchors is the box count of each position. - Each variance is in (xcenter, ycenter, w, h) format. + Anchors(Variable),Variances(Variable): + + two variables: + + - Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. \ + H is the height of input, W is the width of input, \ + num_anchors is the box count of each position. \ + Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. + - Variances(Variable): The expanded variances of anchors \ + with a layout of [H, W, num_priors, 4]. \ + H is the height of input, W is the width of input \ + num_anchors is the box count of each position. \ + Each variance is in (xcenter, ycenter, w, h) format. 
Examples: @@ -1748,35 +1752,35 @@ def generate_proposals(scores, eta=1.0, name=None): """ - ** Generate proposal Faster-RCNN ** - - This operation proposes RoIs according to each box with their probability to be a foreground object and - the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals - could be used to train detection net. - - For generating proposals, this operation performs following steps: - - 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) - 2. Calculate box locations as proposals candidates. - 3. Clip boxes to image - 4. Remove predicted boxes with small area. - 5. Apply NMS to get final proposals as output. - - - Args: - scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. - N is batch size, A is number of anchors, H and W are height and width of the feature map. - bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location. - im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale - between origin image size and the size of feature map. - anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, - num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. - variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. - pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default. - post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default. - nms_thresh(float): Threshold in NMS, 0.5 by default. - min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. - eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. + **Generate proposal Faster-RCNN** + + This operation proposes RoIs according to each box with their probability to be a foreground object and + the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals + could be used to train detection net. + + For generating proposals, this operation performs following steps: + + 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) + 2. Calculate box locations as proposals candidates. + 3. Clip boxes to image + 4. Remove predicted boxes with small area. + 5. Apply NMS to get final proposals as output. + + Args: + scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. + N is batch size, A is number of anchors, H and W are height and width of the feature map. + bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location. + im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale + between origin image size and the size of feature map. + anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, + num_anchors is the box count of each position. 
Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. + variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. + pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default. + post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default. + nms_thresh(float): Threshold in NMS, 0.5 by default. + min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. + eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. + """ helper = LayerHelper('generate_proposals', **locals()) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 42f4959a83..9a29b25093 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -949,12 +949,11 @@ def shuffle(reader, buffer_size): is determined by argument buf_size. Args: - param reader: the original reader whose output will be shuffled. - type reader: callable - param buf_size: shuffle buffer size. - type buf_size: int - return: the new reader whose output is shuffled. - rtype: callable + reader(callable): the original reader whose output will be shuffled. + buf_size(int): shuffle buffer size. + + Returns: + callable: the new reader whose output is shuffled. """ return __create_unshared_decorated_reader__( 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index cc1fdbd285..615a35ba91 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -26,7 +26,7 @@ from ..initializer import Normal, Constant from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ -from .tensor import concat +from .tensor import concat, assign from . import utils from .. import unique_name from functools import reduce @@ -233,7 +233,7 @@ def fc(input, dimensions will be flatten to form the first dimension of the final matrix (height of the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to form the second dimension of the final matrix (width of the matrix). For example, suppose - `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. + `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable parameters/weights of this layer. @@ -340,9 +340,7 @@ def embedding(input, """ helper = LayerHelper('embedding', **locals()) - remote_prefetch = False - if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'): - remote_prefetch = True + remote_prefetch = is_sparse and (not is_distributed) if remote_prefetch: assert is_sparse is True and is_distributed is False w = helper.create_parameter( @@ -505,31 +503,33 @@ def lstm(input, In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: - $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ + .. 
math:: + + i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) - $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ + f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) - $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ + o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) - $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ + \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) - $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ + c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} - $$ h_t = o_t \\odot tanh(c_t) $$ + h_t &= o_t \odot tanh(c_t) - - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix of weights from the input gate to the input) - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). - sigmoid is the logistic sigmoid function. - $i, f, o$ and $c$ are the input gate, forget gate, output gate, and cell activation vectors, respectively, all of which have the same size as the cell output activation vector $h$. - - The $\odot$ is the element-wise product of the vectors. - - `tanh` is the activation functions. - - $\tilde{c_t}$ is also called candidate hidden state, + - The :math:`\odot` is the element-wise product of the vectors. + - :math:`tanh` is the activation functions. + - :math:`\\tilde{c_t}` is also called candidate hidden state, which is computed based on the current input and the previous hidden state. - Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, + Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication, X represensts a matrix multiplication @@ -556,14 +556,18 @@ def lstm(input, Returns: - rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) - if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) - last_h(Tensor): the hidden state of the last step of LSTM - shape is ( num_layers x batch_size x hidden_size ) - if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) - last_c(Tensor): the cell state of the last step of LSTM - shape is ( num_layers x batch_size x hidden_size ) - if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + rnn_out(Tensor),last_h(Tensor),last_c(Tensor): + + Three tensors, rnn_out, last_h, last_c: + + - rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \ + if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) + - last_h is the hidden state of the last step of LSTM \ + shape is ( num_layers x batch_size x hidden_size ) \ + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + - last_c(Tensor): the cell state of the last step of LSTM \ + shape is ( num_layers x batch_size x hidden_size ) \ + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) Examples: @@ -1220,6 +1224,8 @@ def dropout(x, probability) the outputs of some units to zero, while others are remain unchanged. + dropout op can be removed from the program to make the program more efficient. + Args: x (Variable): The input tensor variable. dropout_prob (float): Probability of setting units to zero. @@ -1230,20 +1236,22 @@ def dropout(x, units will be dropped. DO NOT use a fixed seed in training. name (str|None): A name for this layer(optional). 
If set None, the layer will be named automatically. - dropout_implementation(string): ['downgrade_in_infer'(defauld)|'upscale_in_train'] + dropout_implementation(string): ['downgrade_in_infer'(default)|'upscale_in_train'] + 1. downgrade_in_infer(default), downgrade the outcome at inference - train: out = input * mask - inference: out = input * dropout_prob - (make is a tensor same shape with input, value is 0 or 1 - ratio of 0 is dropout_prob) + + - train: out = input * mask + - inference: out = input * dropout_prob + + (mask is a tensor same shape with input, value is 0 or 1 + ratio of 0 is dropout_prob) 2. upscale_in_train, upscale the outcome at training time - train: out = input * mask / ( 1.0 - dropout_prob ) - inference: out = input - (make is a tensor same shape with input, value is 0 or 1 - ratio of 0 is dropout_prob) - dropout op can be removed from the program. - the program will be efficient + - train: out = input * mask / ( 1.0 - dropout_prob ) + - inference: out = input + + (mask is a tensor same shape with input, value is 0 or 1 + ratio of 0 is dropout_prob) Returns: @@ -1333,11 +1341,15 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): A 2-D tensor with shape [N x 1], the cross entropy loss. Raises: - `ValueError`: 1) the 1st dimension of `input` and `label` are not equal. - 2) when `soft_label == True`, and the 2nd dimension of - `input` and `label` are not equal. - 3) when `soft_label == False`, and the 2nd dimension of - `label` is not 1. + ValueError: + + 1. the 1st dimension of ``input`` and ``label`` are not equal. + + 2. when ``soft_label == True``, and the 2nd dimension of + ``input`` and ``label`` are not equal. + + 3. when ``soft_label == False``, and the 2nd dimension of + ``label`` is not 1. Examples: .. code-block:: python @@ -1458,7 +1470,7 @@ def chunk_eval(input, F1-score of chunk detection. For some basics of chunking, please refer to - 'Chunking with Support Vector Machines '. + `Chunking with Support Vector Machines `_ . ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. @@ -1823,7 +1835,7 @@ def conv2d(input, of conv2d. If it is set to None or one attribute of ParamAttr, conv2d will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with :math:`Normal(0.0, std)`, - and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv2d @@ -2276,7 +2288,7 @@ def sequence_slice(input, offset, length, name=None): .. code-block:: text - - Case: + - Case: Given the input Variable **input**: @@ -2292,7 +2304,8 @@ def sequence_slice(input, offset, length, name=None): out.lod = [[2, 1]], out.dims = (3, 2). - NOTE: The first dimension size of **input**, **offset** and **length** + Note: + The first dimension size of **input**, **offset** and **length** should be equal. The **offset** should start from 0. Args: @@ -2540,12 +2553,12 @@ def adaptive_pool2d(input, Examples: .. 
code-block:: python - # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], + # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], # output shape is [N, C, m, n], adaptive pool divide H and W dimentions - # of input data into m * n grids averagely and performs poolings in each + # of input data into m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: - # + # # for i in range(m): # for j in range(n): # hstart = floor(i * H / m) @@ -2570,12 +2583,7 @@ def adaptive_pool2d(input, raise ValueError( "invalid setting 'require_index' true when 'pool_type' is 'avg'.") - def _is_list_or_tuple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) - - if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2: - raise ValueError( - "'pool_size' should be a list or tuple with length as 2.") + pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') if pool_type == "max": l_type = 'max_pool2d_with_index' @@ -2639,10 +2647,10 @@ def adaptive_pool3d(input, # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n], # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimentions - # of input data into l * m * n grids averagely and performs poolings in each + # of input data into l * m * n grids averagely and performs poolings in each # grid to get output. # adaptive average pool performs calculations as follow: - # + # # for i in range(l): # for j in range(m): # for k in range(n): @@ -2652,7 +2660,7 @@ def adaptive_pool3d(input, # hend = ceil((j + 1) * H / m) # wstart = floor(k * W / n) # wend = ceil((k + 1) * W / n) - # output[:, :, i, j, k] = + # output[:, :, i, j, k] = # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) # data = fluid.layers.data( @@ -2671,12 +2679,7 @@ def adaptive_pool3d(input, raise ValueError( "invalid setting 'require_index' true when 'pool_type' is 'avg'.") - def _is_list_or_tuple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) - - if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3: - raise ValueError( - "'pool_size' should be a list or tuple with length as 3.") + pool_size = utils.convert_to_list(pool_size, 3, 'pool_size') if pool_type == "max": l_type = 'max_pool3d_with_index' @@ -3013,7 +3016,7 @@ def group_norm(input, """ **Group Normalization Layer** - Refer to `Group Normalization ` + Refer to `Group Normalization `_ . Args: input(Variable): The input tensor variable. @@ -3140,8 +3143,8 @@ def conv2d_transpose(input, H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\ - H_{out} \in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ - W_{out} \in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) + H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ + W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) Args: input(Variable): The input image with [N, C, H, W] format. @@ -4704,9 +4707,9 @@ def ctc_greedy_decoder(input, blank, name=None): name (str): The name of this layer. It is optional. Returns: - Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. - 'Lp' is the sum if all output sequences' length. If all the sequences - in result were empty, the result LoDTensor will be [-1] with + Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. 
\ + 'Lp' is the sum if all output sequences' length. If all the sequences \ + in result were empty, the result LoDTensor will be [-1] with \ LoD [[]] and dims [1, 1]. Examples: @@ -5027,12 +5030,18 @@ def nce(input, else: num_neg_samples = int(num_neg_samples) + remote_prefetch = is_sparse + print( + "With sparse mode, if your models has only small parameter prefetch may cause speed down" + ) + attrs = { 'num_total_classes': int(num_total_classes), 'num_neg_samples': num_neg_samples, 'seed': seed, 'sampler': sampler, - 'is_sparse': is_sparse + 'is_sparse': is_sparse, + 'remote_prefetch': remote_prefetch } helper.append_op( @@ -5072,13 +5081,13 @@ def hsigmoid(input, `_ And if you want to use the costumed tree by set 'is_custom' as true you may need to do following things first: - 1. using your word dict to build a binary tree, each leaf node should be an word of your word dict - 2. build a dict to store word_id -> word's leaf to root path, we call it path_table. - 3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code - means label of each binary classification, using 1 indicate true, 0 indicate false. - 4. now, each word should has its path and code along the path, you can pass a batch of path and code - related to the same batch of inputs. + 1. using your word dict to build a binary tree, each leaf node should be an word of your word dict + 2. build a dict to store word_id -> word's leaf to root path, we call it path_table. + 3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code + means label of each binary classification, using 1 indicate true, 0 indicate false. + 4. now, each word should has its path and code along the path, you can pass a batch of path and code + related to the same batch of inputs. Args: input (Variable): The input tensor variable with shape @@ -5142,7 +5151,10 @@ def hsigmoid(input, pass weights = None - + remote_prefetch = is_sparse + print( + "With sparse mode, if your models has only small parameter prefetch may cause speed down" + ) if not is_custom: weights = helper.create_parameter( attr=helper.param_attr, @@ -5158,7 +5170,7 @@ def hsigmoid(input, inputs = { "X": input, "W": weights, - "PTable": path_table, + "PathTable": path_table, "PathCode": path_code, "Label": label } @@ -5181,9 +5193,13 @@ def hsigmoid(input, type="hierarchical_sigmoid", inputs=inputs, outputs={"Out": out, - "PreOut": pre_out}, - attrs={"num_classes": num_classes, - "is_sparse": is_sparse}) + "PreOut": pre_out, + "W_Out": weights}, + attrs={ + "num_classes": num_classes, + "is_sparse": is_sparse, + "remote_prefetch": remote_prefetch + }) return out @@ -5485,11 +5501,11 @@ def softmax_with_cross_entropy(logits, .. math:: - max_j = \\max_{i=0}^{K}{\\text{logit}_i} + max_j &= \\max_{i=0}^{K}{\\text{logit}_i} - log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j) + log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j) - softmax_j = \\exp(logit_j - max_j - {log\\_max\\_sum}_j) + softmax_j &= \\exp(logit_j - max_j - {log\\_max\\_sum}_j) and then cross entropy loss is calculated by softmax and label. @@ -5515,11 +5531,11 @@ def softmax_with_cross_entropy(logits, along with the cross entropy loss. Default: False Returns: - Variable or Tuple of two Variables: Return the cross entropy loss if - `return_softmax` is False, otherwise the tuple - (loss, softmax), where the cross entropy loss is - a 2-D tensor with shape [N x 1], and softmax is a - 2-D tensor with shape [N x K]. 
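A quick numeric check of the max-shifted softmax/cross-entropy form given a few lines above, as a minimal NumPy sketch (the helper name `stable_softmax_with_ce` and the hard-label indexing are illustrative assumptions, not part of the patch itself):

.. code-block:: python

    import numpy as np

    def stable_softmax_with_ce(logits, label_index):
        # max_j / log_max_sum_j follow the numerically stable form in the docstring
        max_j = logits.max()
        log_max_sum_j = np.log(np.exp(logits - max_j).sum())
        softmax = np.exp(logits - max_j - log_max_sum_j)
        loss = -np.log(softmax[label_index])  # hard-label cross entropy
        return loss, softmax

    # e.g. stable_softmax_with_ce(np.array([2.0, 1.0, 0.1]), 0)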
+ Variable or Tuple of two Variables: Return the cross entropy loss if \ + `return_softmax` is False, otherwise the tuple \ + (loss, softmax), where the cross entropy loss is \ + a 2-D tensor with shape [N x 1], and softmax is a \ + 2-D tensor with shape [N x K]. Examples: .. code-block:: python @@ -5792,21 +5808,27 @@ def squeeze(input, axes, name=None): the single dimensions will be removed from the shape. If an axis is selected with shape entry not equal to one, an error is raised. - Examples: - Case 1: - Given - X.shape = (1, 3, 1, 5) - and - axes = [0] - we get: - Out.shape = (3, 1, 5) - Case 2: - Given - X.shape = (1, 3, 1, 5) - and - axes = [] - we get: - Out.shape = (3, 5) + For example: + + .. code-block:: text + + Case 1: + + Given + X.shape = (1, 3, 1, 5) + and + axes = [0] + we get: + Out.shape = (3, 1, 5) + + Case 2: + + Given + X.shape = (1, 3, 1, 5) + and + axes = [] + we get: + Out.shape = (3, 5) Args: input (Variable): The input variable to be squeezed. @@ -5842,6 +5864,9 @@ def unsqueeze(input, axes, name=None): Dimension indices in axes are as seen in the output tensor. For example: + + .. code-block:: text + Given a tensor such that tensor with shape [3, 4, 5], then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1]. @@ -6729,8 +6754,11 @@ def sequence_scatter(input, index, updates, name=None): the columns to update in each row of X. Here is an example: + Given the following input: + .. code-block:: text + input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]] @@ -6743,7 +6771,9 @@ def sequence_scatter(input, index, updates, name=None): updates.lod = [[ 0, 3, 8, 12]] Then we have the output: + .. code-block:: text + out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0], [1.0, 1.0, 1.4, 1.3, 1.2, 1.1], [1.0, 1.0, 1.3, 1.2, 1.4, 1.1]] @@ -6759,7 +6789,7 @@ def sequence_scatter(input, index, updates, name=None): name (str|None): The output variable name. Default None. Returns: - output (Variable): The output is a tensor with the same shape as input. + Variable: The output is a tensor with the same shape as input. Examples: @@ -6933,7 +6963,7 @@ def mean_iou(input, label, num_classes): .. math:: - IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}. + IOU = \\frac{true\_positive}{(true\_positive + false\_positive + false\_negative)}. The predictions are accumulated in a confusion matrix and mean-IOU is then calculated from it. @@ -6946,9 +6976,13 @@ def mean_iou(input, label, num_classes): num_classes (int): The possible number of labels. Returns: - mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1]. - out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class. - out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. + mean_iou (Variable),out_wrong(Variable),out_correct(Variable): + + Three variables: + + - mean_iou : A Tensor representing the mean intersection-over-union with shape [1]. + - out_wrong: A Tensor with shape [num_classes]. The wrong numbers of each class. + - out_correct: A Tensor with shape [num_classes]. The correct numbers of each class. Examples: @@ -7144,7 +7178,7 @@ def affine_grid(theta, out_shape, name=None): Args: theta (Variable): A batch of affine transform parameters with shape [N, 2, 3]. out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. - out_shape can be a Variable or a list or tuple. 
+ ``out_shape`` can be a Variable or a list or tuple. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -7157,6 +7191,7 @@ def affine_grid(theta, out_shape, name=None): Examples: .. code-block:: python + theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32") out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32") data = fluid.layers.affine_grid(theta, out_shape) @@ -7192,9 +7227,10 @@ def affine_grid(theta, out_shape, name=None): def rank_loss(label, left, right, name=None): """ + **Rank loss layer for RankNet** - RankNet(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf) + `RankNet `_ is a pairwise ranking model with a training sample consisting of a pair of documents, A and B. Label P indicates whether A is ranked higher than B or not: @@ -7202,16 +7238,19 @@ def rank_loss(label, left, right, name=None): P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information about the rank of the input pair. - Rank loss layer takes three inputs: left (o_i), right (o_j) and - label (P_{i,j}). The inputs respectively represent RankNet's output scores + Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and + label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores for documents A and B and the value of label P. The following equation computes rank loss C_{i,j} from the inputs: - $$ - C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\ - o_{i,j} = o_i - o_j \\ - \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} - $$ + .. math:: + + C_{i,j} &= -\\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\\\ + + o_{i,j} &= o_i - o_j \\\\ + + \\tilde{P_{i,j}} &= \\left \{0, 0.5, 1 \\right \} \ or \ \\left \{0, 1 \\right \} + Rank loss layer takes batch inputs with size batch_size (batch_size >= 1). @@ -7237,7 +7276,6 @@ def rank_loss(label, left, right, name=None): right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") out = fluid.layers.rank_loss(label, left, right) - """ helper = LayerHelper('rank_loss', **locals()) @@ -7269,7 +7307,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): .. math:: - rank\_loss &= max(0, -label * (left - right) + margin) + rank\_loss = max(0, -label * (left - right) + margin) Args: label (Variable): Indicates whether the left is ranked higher than the right or not. @@ -7278,12 +7316,17 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): margin (float): Indicates the given margin. name (str|None): A name for this layer (optional). If set None, the layer will be named automatically. + Returns: Variable: The ranking loss. + Raises: ValueError: Any of label, left, and right is not a Variable. + Examples: + .. code-block:: python + label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32") left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32") right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") @@ -7587,7 +7630,8 @@ def prelu(x, mode, param_attr=None, name=None): """ Equation: - y = \max(0, x) + alpha * \min(0, x) + .. math:: + y = \max(0, x) + \\alpha * \min(0, x) Args: x (Variable): The input tensor. @@ -7651,10 +7695,10 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): Examples: - .. code-block:: python + .. 
code-block:: python - x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") - y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0) + x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") + y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0) """ helper = LayerHelper('brelu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -7683,8 +7727,8 @@ def leaky_relu(x, alpha=0.02, name=None): .. code-block:: python - x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") - y = fluid.layers.leaky_relu(x, alpha=0.01) + x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") + y = fluid.layers.leaky_relu(x, alpha=0.01) """ helper = LayerHelper('leaky_relu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -7712,8 +7756,8 @@ def soft_relu(x, threshold=40.0, name=None): .. code-block:: python - x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") - y = fluid.layers.soft_relu(x, threshold=20.0) + x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") + y = fluid.layers.soft_relu(x, threshold=20.0) """ helper = LayerHelper('soft_relu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -7730,22 +7774,31 @@ def flatten(x, axis=1, name=None): **Flatten layer** Flattens the input tensor into a 2D matrix. - Examples: - Case 1: - Given - X.shape = (3, 100, 100, 4) - and - axis = 2 - We get: - Out.shape = (3 * 100, 4 * 100) - - Case 2: - Given - X.shape = (3, 100, 100, 4) - and - axis = 0 - We get: - Out.shape = (1, 3 * 100 * 100 * 4) + For Example: + + .. code-block:: text + + Case 1: + + Given + X.shape = (3, 100, 100, 4) + + and + axis = 2 + + We get: + Out.shape = (3 * 100, 4 * 100) + + Case 2: + + Given + X.shape = (3, 100, 100, 4) + + and + axis = 0 + + We get: + Out.shape = (1, 3 * 100 * 100 * 4) Args: x (Variable): A tensor of rank >= axis. @@ -7759,9 +7812,9 @@ def flatten(x, axis=1, name=None): will be named automatically. Returns: - Variable: A 2D tensor with the contents of the input tensor, with input - dimensions up to axis flattened to the outer dimension of - the output and remaining input dimensions flattened into the + Variable: A 2D tensor with the contents of the input tensor, with input \ + dimensions up to axis flattened to the outer dimension of \ + the output and remaining input dimensions flattened into the \ inner dimension of the output. Raises: @@ -7801,19 +7854,23 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): The enumerated sequence has the same 1st dimension with variable `input`, and the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. - Examples: - Case 1: - Input: - X.lod = [[0, 3, 5]] - X.data = [[1], [2], [3], [4], [5]] - X.dims = [5, 1] - Attrs: - win_size = 2 - pad_value = 0 - Output: - Out.lod = [[0, 3, 5]] - Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] - Out.dims = [5, 2] + .. code-block:: text + + Case 1: + + Input: + X.lod = [[0, 3, 5]] + X.data = [[1], [2], [3], [4], [5]] + X.dims = [5, 1] + + Attrs: + win_size = 2 + pad_value = 0 + + Output: + Out.lod = [[0, 3, 5]] + Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] + Out.dims = [5, 2] Args: input (Variable): The input variable which is a index sequence. @@ -8896,6 +8953,7 @@ def similarity_focus(input, axis, indexes, name=None): SimilarityFocus Operator Generate a similarity focus mask with the same shape of input using the following method: + 1. 
Extract the 3-D tensor(here the first dimension is BatchSize) corresponding to the axis according to the indexes. For example, if axis=1 and indexes=[a], it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X @@ -8969,14 +9027,16 @@ def similarity_focus(input, axis, indexes, name=None): indexes(list): Indicating the indexes of the selected dimension. Returns: - Variable: A tensor variable with the same shape and same type - as the input. + Variable: A tensor variable with the same shape and same type \ + as the input. Examples: .. code-block:: python + data = fluid.layers.data( name='data', shape=[2, 3, 2, 2], dtype='float32') x = fluid.layers.layer_norm(input=data, axis=1, indexes=[0]) + """ helper = LayerHelper('similarity_focus', **locals()) # check attrs @@ -9055,6 +9115,7 @@ def hash(input, hash_size, num_hash=1, name=None): Examples: .. code-block:: python + word_dict = paddle.dataset.imdb.word_dict() x = fluid.layers.data(shape[1], dtype='int32', lod_level=1) out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000) @@ -9075,50 +9136,52 @@ def hash(input, hash_size, num_hash=1, name=None): def grid_sampler(x, grid, name=None): """ This operation samples input X by using bilinear interpolation based on - flow field grid, which is usually gennerated by affine_grid. The grid of + flow field grid, which is usually gennerated by :code:`affine_grid` . The grid of shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates with shape [N, H, W] each, where grid_x is indexing the 4th dimension (in width dimension) of input data x and grid_y is indexng the 3rd dimention (in height dimension), finally results is the bilinear interpolation value of 4 nearest corner points. - Step 1: - Get (x, y) grid coordinates and scale to [0, H-1/W-1]. + .. code-block:: text - grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1) - grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) + Step 1: + Get (x, y) grid coordinates and scale to [0, H-1/W-1]. - Step 2: - Indices input data X with grid (x, y) in each [H, W] area, and bilinear - interpolate point value by 4 nearest points. + grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1) + grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) - wn ------- y_n ------- en - | | | - | d_n | - | | | - x_w --d_w-- grid--d_e-- x_e - | | | - | d_s | - | | | - ws ------- y_s ------- wn + Step 2: + Indices input data X with grid (x, y) in each [H, W] area, and bilinear + interpolate point value by 4 nearest points. 
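The corner diagram and the distance definitions for this step continue directly below; as a rough NumPy sketch of the bilinear rule they describe (assuming the grid has already been scaled to [0, H-1] / [0, W-1] as in Step 1; the helper name `bilinear_sample_point` is made up and boundary clamping is omitted):

.. code-block:: python

    import numpy as np

    def bilinear_sample_point(X, n, c, grid_x, grid_y):
        # X has layout [N, C, H, W]; (grid_x, grid_y) is one scaled grid location
        x_w, y_n = int(np.floor(grid_x)), int(np.floor(grid_y))  # west / north corners
        x_e, y_s = x_w + 1, y_n + 1                              # east / south corners
        d_w, d_e = grid_x - x_w, x_e - grid_x                    # distances to west / east
        d_n, d_s = grid_y - y_n, y_s - grid_y                    # distances to north / south
        wn, en = X[n, c, y_n, x_w], X[n, c, y_n, x_e]            # 4 nearest corner values
        ws, es = X[n, c, y_s, x_w], X[n, c, y_s, x_e]
        return wn * d_e * d_s + en * d_w * d_s + ws * d_e * d_n + es * d_w * d_n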
- x_w = floor(x) // west side x coord - x_e = x_w + 1 // east side x coord - y_n = floor(y) // north side y coord - y_s = y_s + 1 // south side y coord + wn ------- y_n ------- en + | | | + | d_n | + | | | + x_w --d_w-- grid--d_e-- x_e + | | | + | d_s | + | | | + ws ------- y_s ------- wn - d_w = grid_x - x_w // distance to west side - d_e = x_e - grid_x // distance to east side - d_n = grid_y - y_n // distance to north side - d_s = y_s - grid_y // distance to south side + x_w = floor(x) // west side x coord + x_e = x_w + 1 // east side x coord + y_n = floor(y) // north side y coord + y_s = y_s + 1 // south side y coord - wn = X[:, :, y_n, x_w] // north-west point value - en = X[:, :, y_n, x_e] // north-east point value - ws = X[:, :, y_s, x_w] // south-east point value - es = X[:, :, y_s, x_w] // north-east point value + d_w = grid_x - x_w // distance to west side + d_e = x_e - grid_x // distance to east side + d_n = grid_y - y_n // distance to north side + d_s = y_s - grid_y // distance to south side - output = wn * d_e * d_s + en * d_w * d_s - + ws * d_e * d_n + es * d_w * d_n + wn = X[:, :, y_n, x_w] // north-west point value + en = X[:, :, y_n, x_e] // north-east point value + ws = X[:, :, y_s, x_w] // south-east point value + es = X[:, :, y_s, x_w] // north-east point value + + output = wn * d_e * d_s + en * d_w * d_s + + ws * d_e * d_n + es * d_w * d_n Args: x(Variable): Input data of shape [N, C, H, W]. @@ -9126,16 +9189,18 @@ def grid_sampler(x, grid, name=None): name (str, default None): The name of this layer. Returns: - out(Variable): Output of shape [N, C, H, W] data samples input X + Variable: Output of shape [N, C, H, W] data samples input X using bilnear interpolation based on input grid. - Exmples: - .. code-block:: python + Examples: + + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32') + theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32') + grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]}) + out = fluid.layers.grid_sampler(x=x, grid=grid) - x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32') - theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32') - grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]}) - out = fluid.layers.grid_sampler(x=x, grid=grid) """ helper = LayerHelper("grid_sampler", **locals()) @@ -9203,19 +9268,19 @@ def add_position_encoding(input, alpha, beta, name=None): """ **Add Position Encoding Layer** - This layer accepts an input 3D-Tensor of shape [N x M x P], and return an + This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an output Tensor of shape [N x M x P] with positional encoding value. - Refer to `Attention Is All You Need`_ . + Refer to `Attention Is All You Need `_ . .. 
math:: - PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})} \\\\ - PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})} \\\\ - Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i) + PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})} \\\\ + PE(pos, 2i + 1) &= \\cos{(pos / 10000^{2i / P})} \\\\ + Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i) Where: - * PE(pos, 2i): the increment for the number at even position - * PE(pos, 2i + 1): the increment for the number at odd position + - :math:`PE(pos, 2i)` : the increment for the number at even position + - :math:`PE(pos, 2i + 1)` : the increment for the number at odd position Args: input (Variable): 3-D input tensor with shape [N x M x P] @@ -9230,6 +9295,7 @@ def add_position_encoding(input, alpha, beta, name=None): .. code-block:: python position_tensor = fluid.layers.add_position_encoding(input=tensor) + """ helper = LayerHelper('add_position_encoding', **locals()) dtype = helper.input_dtype() @@ -9262,13 +9328,13 @@ def bilinear_tensor_product(x, For example: .. math:: - out{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1 + out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1 In this formula: - :math:`x`: the first input contains M elements, shape is [batch_size, M]. - :math:`y`: the second input contains N elements, shape is [batch_size, N]. - :math:`W_{i}`: the i-th learned weight, shape is [M, N] - - :math:`out{i}`: the i-th element of out, shape is [batch_size, size]. + - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size]. - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`. Args: @@ -9348,7 +9414,7 @@ class PyFuncRegistry(object): raise TypeError('func must be a Python function') self._func = func - # find named args using reflection + # find named args using reflection args = inspect.getargspec(self._func) if len(args[0]) == 0 and args[1] is None and args[2] is None: # Function with no inputs @@ -9359,15 +9425,15 @@ class PyFuncRegistry(object): ''' Why record self here? - 1. For debug usage. Users can call - :code:`py_func.registered_func(idx)` method + 1. For debug usage. Users can call + :code:`py_func.registered_func(idx)` method to find the registered function corresponding - to :code:`idx`. + to :code:`idx`. - 2. For increasing reference count of self. - It seems that to release Python object + 2. For increasing reference count of self. + It seems that to release Python object whose reference count is 1 would cause - segmentation fault error in C++ side. + segmentation fault error in C++ side. May be lack of Python GC in C++ side? ''' PyFuncRegistry._register_funcs.append(self) @@ -9418,7 +9484,7 @@ class PyFuncRegistry(object): def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): """ PyFunc Operator. - + User can use :code:`py_func` to register operators in Python side. The inputs of :code:`func` is :code:`LoDTensor` and outputs can be numpy array or :code:`LoDTensor`. Paddle would call the registered @@ -9436,7 +9502,7 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): no gradient, users should return None. This function can also be used to debug the running network. User can - add a :code:`py_func` operator without output, and print input + add a :code:`py_func` operator without output, and print input :code:`x` inside :code:`func`. Args: @@ -9444,50 +9510,50 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): x (Variable|list(Variable)|tuple(Variable)): inputs of :code:`func`. 
out (Variable|list(Variable)|tuple(Variable)): outputs of :code:`func`. Paddle cannot infer shapes and data types of :code:`out`. Users - should create :code:`out` beforehand. + should create :code:`out` beforehand. backward_func (callable|None): backward Python function. - None means no backward. Default None. + None means no backward. Default None. skip_vars_in_backward_input (Variable|list(Variable)|tuple(Variable)): - Variables that are not needed in :code:`backward_func` inputs. + Variables that are not needed in :code:`backward_func` inputs. These variables must be any of :code:`x` and :code:`out`. If set, these vars would not be inputs of :code:`backward_func`, - Only useful when :code:`backward_func` is not None. Default None. + Only useful when :code:`backward_func` is not None. Default None. Returns: out (Variable|list(Variable)|tuple(Variable)): input :code:`out` Examples: - + >>> import paddle.fluid as fluid >>> import six >>> >>> def create_tmp_var(name, dtype, shape): >>> return fluid.default_main_program().current_block().create_var( - >>> name=name, dtype=dtype, shape=shape) + >>> name=name, dtype=dtype, shape=shape) >>> >>> # tanh activation has been provided by Paddle C++ op - >>> # Here, we only use tanh to be an example to show the usage + >>> # Here, we only use tanh to be an example to show the usage >>> # of py_func >>> def tanh(x): >>> return np.tanh(x) - >>> + >>> >>> # forward input x is skipped >>> def tanh_grad(y, dy): >>> return np.array(dy) * (1 - np.square(np.array(y))) >>> >>> def debug_func(x): - >>> print(x) + >>> print(x) >>> >>> def simple_net(img, label): >>> hidden = img >>> for idx in six.moves.range(4): >>> hidden = fluid.layers.fc(hidden, size=200) >>> new_hidden = create_tmp_var(name='hidden_{}'.format(idx), - >>> dtype=hidden.dtype, shape=hidden.shape) + >>> dtype=hidden.dtype, shape=hidden.shape) >>> >>> # user-defined layers with forward and backward - >>> hidden = fluid.layers.py_func(func=tanh, x=hidden, - >>> out=new_hidden, backward_func=tanh_grad, + >>> hidden = fluid.layers.py_func(func=tanh, x=hidden, + >>> out=new_hidden, backward_func=tanh_grad, >>> skip_vars_in_backward_input=hidden) >>> >>> # user-defined debug layers to print variables @@ -9658,47 +9724,3 @@ def huber_loss(input, label, delta): 'Residual': residual}, attrs={'delta': delta}) return out - - -class FC(layers.PyLayer): - def __init__(self, - size, - param_attr=None, - num_flatten_dims=1, - dtype=core.VarDesc.VarType.FP32): - super(FC, self).__init__() - self._size = size - self._num_flatten_dims = num_flatten_dims - self._dtype = dtype - self._helper = LayerHelper('FC', param_attr=param_attr) - - def _build_once(self, inputs): - input_shape = inputs[0].shape - param_shape = [ - reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) - ] + [self._size] - self._w = self._helper.create_parameter( - attr=self._helper.param_attr, - shape=param_shape, - dtype=self._dtype, - is_bias=False) - - def forward(self, inputs): - tmp = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="mul", - inputs={"X": inputs[0], - "Y": self._w}, - outputs={"Out": tmp}, - attrs={ - "x_num_col_dims": self._num_flatten_dims, - "y_num_col_dims": 1 - }) - - out = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="sum", - inputs={"X": [tmp]}, - outputs={"Out": out}, - attrs={"use_mkldnn": False}) - return out diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 
49a486cf0c..ce9f508c9f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -20,6 +20,7 @@ from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable from ..initializer import Constant, force_init_on_cpu from ..core import VarDesc +from ..imperative import base as imperative_base from .layer_function_generator import templatedoc import numpy @@ -104,15 +105,15 @@ def create_global_var(shape, Args: shape(list[int]): shape of the variable - value(float): the value of the variable. The new created + value(float): the value of the variable. The new created variable will be filled with it. dtype(string): data type of the variable - persistable(bool): if this variable is persistable. + persistable(bool): if this variable is persistable. Default: False - force_cpu(bool): force this variable to be on CPU. + force_cpu(bool): force this variable to be on CPU. Default: False - name(str|None): The name of the variable. If set to None the variable - name will be generated automatically. + name(str|None): The name of the variable. If set to None the variable + name will be generated automatically. Default: None Returns: @@ -121,21 +122,26 @@ def create_global_var(shape, Examples: .. code-block:: python - var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', + var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', persistable=True, force_cpu=True, name='new_var') """ helper = LayerHelper("global_var", **locals()) var = helper.create_global_variable( - dtype=dtype, shape=shape, persistable=persistable, name=name) + dtype=dtype, + shape=shape, + persistable=persistable, + name=name, + stop_gradient=True) helper.set_variable_initializer( var, initializer=Constant( value=float(value), force_cpu=force_cpu)) + return var def cast(x, dtype): """ - This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts + This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts it to the output with :attr:`dtype`. Args: @@ -199,9 +205,9 @@ def tensor_array_to_tensor(input, axis=1, name=None): and returns that as the output. A simple example as below: - + .. code-block:: text - + Given: input.data = {[[0.6, 0.1, 0.3], @@ -210,9 +216,9 @@ def tensor_array_to_tensor(input, axis=1, name=None): [1.8]], [[2.3, 2.1], [2.5, 2.4]]} - + axis = 1 - + Then: output.data = [[0.6, 0.1, 0.3, 1.3, 2.3, 2.1], @@ -393,9 +399,6 @@ def fill_constant_batch_size_like(input, It also sets *stop_gradient* to True. - >>> data = fluid.layers.fill_constant_batch_size_like( - >>> input=like, shape=[1], value=0, dtype='int64') - Args: input(${input_type}): ${input_comment}. @@ -411,6 +414,14 @@ def fill_constant_batch_size_like(input, Returns: ${out_comment}. + + Examples: + + .. code-block:: python + + data = fluid.layers.fill_constant_batch_size_like( + input=like, shape=[1], value=0, dtype='int64') + """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_variable_for_type_inference(dtype=dtype) @@ -493,12 +504,12 @@ def argmax(x, axis=0): def argsort(input, axis=-1, name=None): """ - Performs sorting on the input Variable along the given axis, and outputs - sorted data Varibale and its corresponding index Variable with the same + Performs sorting on the input Variable along the given axis, and outputs + sorted data Varibale and its corresponding index Variable with the same shape as :attr:`input`. .. 
code-block:: text - + For example, the given axis is -1 and the input Variable input = [[0.15849551, 0.45865775, 0.8563702 ], @@ -511,15 +522,15 @@ def argsort(input, axis=-1, name=None): and the sorted indices along the given axis turn outs to be - indices = [[0, 1, 2], + indices = [[0, 1, 2], [0, 2, 1]] Args: input(Variable): The input Variable for sorting. - axis(int): The axis along which to sort the input Variable. When - :attr:`axis` < 0, the actual axis will be :attr:`axis` + + axis(int): The axis along which to sort the input Variable. When + :attr:`axis` < 0, the actual axis will be :attr:`axis` + rank(:attr:`input`). Default -1, the last dimension. - name(str|None): (optional) A name for this layer. If set None, the + name(str|None): (optional) A name for this layer. If set None, the layer will be named automatically. Returns: diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 85af8fea13..fd07ff0ba3 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -361,8 +361,8 @@ class ChunkEvaluator(MetricBase): Accumulate counter numbers output by chunk_eval from mini-batches and compute the precision recall and F1-score using the accumulated counter numbers. - For some basics of chunking, please refer to - 'Chunking with Support Vector Machines '. + For some basics of chunking, please refer to + `Chunking with Support Vector Machines `_ . ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection, and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. @@ -391,6 +391,7 @@ class ChunkEvaluator(MetricBase): def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks): """ Update the states based on the layers.chunk_eval() ouputs. + Args: num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch. num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch. @@ -450,9 +451,9 @@ class EditDistance(MetricBase): distance, instance_error = distance_evaluator.eval() In the above example: - 'distance' is the average of the edit distance in a pass. - 'instance_error' is the instance error rate in a pass. + - 'distance' is the average of the edit distance in a pass. + - 'instance_error' is the instance error rate in a pass. """ @@ -567,12 +568,15 @@ class DetectionMAP(object): Calculate the detection mean average precision (mAP). The general steps are as follows: + 1. calculate the true positive and false positive according to the input - of detection and labels. + of detection and labels. 2. calculate mAP value, support two versions: '11 point' and 'integral'. Please get more information from the following articles: + https://sanchom.wordpress.com/tag/average-precision/ + https://arxiv.org/abs/1512.02325 Args: @@ -613,10 +617,12 @@ class DetectionMAP(object): for data in batches: loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch) - In the above example: + In the above example: + + - 'cur_map_v' is the mAP of current mini-batch. + - 'accum_map_v' is the accumulative mAP of one pass. - 'cur_map_v' is the mAP of current mini-batch. - 'accum_map_v' is the accumulative mAP of one pass. 
+ """ def __init__(self, diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 59c22d4e49..779cb5f961 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -30,6 +30,7 @@ from .initializer import Constant from .layer_helper import LayerHelper from .layers import ops from .regularizer import append_regularization_ops +from .imperative import base as imperative_base __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', @@ -301,25 +302,45 @@ class Optimizer(object): This method combines interface `append_backward()` and `create_optimization_pass()` into one. """ - params_grads = append_backward(loss, parameter_list, no_grad_set, - [error_clip_callback]) + if imperative_base.enabled(): + if parameter_list is not None: + params_grads = parameter_list + else: + program = loss.block.program + parameters = program.global_block().all_parameters() + params_grads = [] + for param in parameters: + # create gradient variable + grad_var = Variable( + block=loss.block, + name=param._ivar._grad_name(), + stop_gradient=True) + grad_var._value = param._ivar.grad_value + params_grads.append((param, grad_var)) + + optimize_ops = self._create_optimization_pass(params_grads, loss, + startup_program) + else: + params_grads = append_backward(loss, parameter_list, no_grad_set, + [error_clip_callback]) + + params_grads = sorted(params_grads, key=lambda x: x[0].name) - params_grads = sorted(params_grads, key=lambda x: x[0].name) + params_grads, table_param_and_grad, table_optimize_op = \ + self._process_distribute_lookuptable(params_grads, loss, startup_program) - params_grads, table_param_and_grad, table_optimize_op = \ - self._process_distribute_lookuptable(params_grads, loss, startup_program) + params_grads = append_gradient_clip_ops(params_grads) - params_grads = append_gradient_clip_ops(params_grads) + # Add regularization if any + params_grads = append_regularization_ops(params_grads, + self.regularization) - # Add regularization if any - params_grads = append_regularization_ops(params_grads, - self.regularization) + optimize_ops = self._create_optimization_pass(params_grads, loss, + startup_program) + if table_optimize_op is not None: + optimize_ops.append(table_optimize_op) + params_grads.append(table_param_and_grad) - optimize_ops = self._create_optimization_pass(params_grads, loss, - startup_program) - if table_optimize_op is not None: - optimize_ops.append(table_optimize_op) - params_grads.append(table_param_and_grad) return optimize_ops, params_grads @@ -364,7 +385,8 @@ class SGDOptimizer(Optimizer): "Grad": param_and_grad[1], "LearningRate": self._create_param_lr(param_and_grad) }, - outputs={"ParamOut": param_and_grad[0]}) + outputs={"ParamOut": param_and_grad[0]}, + stop_gradient=True) return sgd_op @@ -448,7 +470,8 @@ class MomentumOptimizer(Optimizer): "VelocityOut": velocity_acc }, attrs={"mu": self._momentum, - "use_nesterov": self._use_nesterov}) + "use_nesterov": self._use_nesterov}, + stop_gradient=True) return momentum_op @@ -477,7 +500,7 @@ class LarsMomentumOptimizer(Optimizer): regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. - + Examples: .. 
code-block:: python @@ -533,7 +556,8 @@ class LarsMomentumOptimizer(Optimizer): "mu": self._momentum, "lars_coeff": self._lars_coeff, "lars_weight_decay": self._lars_weight_decay - }) + }, + stop_gradient=True) return momentum_op @@ -608,7 +632,8 @@ class AdagradOptimizer(Optimizer): }, outputs={"ParamOut": param_and_grad[0], "MomentOut": moment_acc}, - attrs={"epsilon": self._epsilon}) + attrs={"epsilon": self._epsilon}, + stop_gradient=True) return adagrad_op @@ -738,7 +763,8 @@ class AdamOptimizer(Optimizer): "beta2": self._beta2, "epsilon": self._epsilon, "lazy_mode": self._lazy_mode - }) + }, + stop_gradient=True) return adam_op @@ -760,13 +786,15 @@ class AdamOptimizer(Optimizer): type="scale", inputs={"X": beta1_pow_acc}, outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}) + attrs={"scale": self._beta1}, + stop_gradient=True) main_block.append_op( type="scale", inputs={"X": beta2_pow_acc}, outputs={"Out": beta2_pow_acc}, - attrs={"scale": self._beta2}) + attrs={"scale": self._beta2}, + stop_gradient=True) class AdamaxOptimizer(Optimizer): @@ -877,7 +905,8 @@ class AdamaxOptimizer(Optimizer): "beta1": self._beta1, "beta2": self._beta2, "epsilon": self._epsilon - }) + }, + stop_gradient=True) return adamax_op @@ -897,7 +926,8 @@ class AdamaxOptimizer(Optimizer): type="scale", inputs={"X": beta1_pow_acc}, outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}) + attrs={"scale": self._beta1}, + stop_gradient=True) class DecayedAdagradOptimizer(Optimizer): @@ -979,7 +1009,8 @@ class DecayedAdagradOptimizer(Optimizer): }, outputs={"ParamOut": param_and_grad[0], "MomentOut": moment_acc}, - attrs={"epsilon": self._epsilon}) + attrs={"epsilon": self._epsilon}, + stop_gradient=True) return decayed_adagrad_op @@ -1075,7 +1106,8 @@ class AdadeltaOptimizer(Optimizer): "AvgSquaredUpdateOut": avg_squared_update_acc }, attrs={"epsilon": self._epsilon, - "rho": self._rho}) + "rho": self._rho}, + stop_gradient=True) return adadelta_op @@ -1224,7 +1256,8 @@ class RMSPropOptimizer(Optimizer): "decay": self._rho, "momentum": self._momentum, "centered": self._centered - }) + }, + stop_gradient=True) return rmsprop_op @@ -1345,7 +1378,8 @@ class FtrlOptimizer(Optimizer): }, attrs={"l1": self._l1, "l2": self._l1, - "lr_power": self._lr_power}) + "lr_power": self._lr_power}, + stop_gradient=True) return ftrl_op @@ -1509,7 +1543,8 @@ class ModelAverage(Optimizer): "average_window": self.average_window, "min_average_window": self.min_average_window, "max_average_window": self.max_average_window, - }) + }, + stop_gradient=True) @contextmanager def apply(self, executor, need_restore=True): diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 74cf76da95..3b066eda11 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -29,6 +29,15 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy +def _is_pserver_mode(main_program): + main = main_program if main_program \ + else framework.default_main_program() + for op in main.global_block().ops: + if op.type in ["send", "recv"]: + return True + return False + + class ParallelExecutor(object): """ ParallelExecutor is designed for data parallelism, which focuses on distributing @@ -128,6 +137,11 @@ class ParallelExecutor(object): build_strategy = BuildStrategy() build_strategy.num_trainers = num_trainers build_strategy.trainer_id = trainer_id + # FIXME(zcd): is_distribution_ is a temporary field, 
because in pserver mode, + # num_trainers is 1, so the current fields of build_strategy doesn't tell if + # it's distributed model. + build_strategy.is_distribution = _is_pserver_mode( + main_program) or num_trainers > 1 # step4: get main_program, scope, local_scopes main = main_program if main_program \ @@ -148,7 +162,7 @@ class ParallelExecutor(object): trainers_endpoints), "num_trainers == len(end_points)" build_strategy.trainers_endpoints = trainers_endpoints - # step5: get persistable_vars, parameter_vars, places. persistable_vars + # step6: get persistable_vars, places. persistable_vars # need be broadcast to other local_scope. persistable_vars = set([ cpt.to_text(v.name) for v in [ @@ -164,7 +178,7 @@ class ParallelExecutor(object): places = list(map(place_obj, self._places)) - # step6: init ParallelExecutor + # step7: init ParallelExecutor self.executor = core.ParallelExecutor( places, persistable_vars, main.desc, cpt.to_text(loss_name) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index d744a00242..e87c1d58c8 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -185,8 +185,10 @@ def main(use_cuda, parallel): if __name__ == '__main__': - for use_cuda in (False, True): - for parallel in (False, True): - if use_cuda and not core.is_compiled_with_cuda(): - continue - main(use_cuda=use_cuda, parallel=parallel) + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + if not on_ci: + for use_cuda in (False, True): + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index c28c0809d8..e81632116c 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -22,18 +22,19 @@ if(NOT WITH_DISTRIBUTE) LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge) LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) + LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op) + LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op) endif(NOT WITH_DISTRIBUTE) if (NOT ${WITH_GPU}) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) -elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) +elseif(${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) endif() list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 -list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py 
b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py new file mode 100644 index 0000000000..5535427ea8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py @@ -0,0 +1,31 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import unittest +from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp + + +class TestNGRAPHMeanOp(TestMeanOp): + def setUp(self): + super(TestNGRAPHMeanOp, self).setUp() + + +class TestNGRAPHFP16MeanOp(TestFP16MeanOp): + def setUp(self): + super(TestNGRAPHFP16MeanOp, self).setUp() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py new file mode 100644 index 0000000000..b42a1f73fa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import print_function +import unittest +from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows + + +class TestNGRAPHScaleOp(TestScaleOp): + def init_dtype_type(self): + pass + + +class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows): + def init_dtype_type(self): + pass + + +class TestNGRAPHScaleFp16Op(TestScaleFp16Op): + def init_dtype_type(self): + pass + + +class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows): + def init_dtype_type(self): + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index e2a9fc183e..2b0ab0cc3b 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -78,7 +78,6 @@ class TestParallelExecutorBase(unittest.TestCase): exec_strategy.allow_op_delay = allow_op_delay if use_fast_executor: exec_strategy.use_experimental_executor = True - build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py index 6cd71e39e4..ab34a51dd9 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py @@ -32,6 +32,8 @@ class TestConv2dFusionOp(OpTest): self.activation = 'relu' self.add_bias = True self.add_residual_data = True + self.channels = None + self.outputs = None self.init_group() self.init_dilation() @@ -49,8 +51,9 @@ class TestConv2dFusionOp(OpTest): input = np.random.random(self.input_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype) - output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype(self.dtype) + self.output, _, _, _, _ = conv2d_forward_naive( + input, filter, self.groups, conv2d_param) + self.output = self.output.astype(self.dtype) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input), @@ -58,19 +61,20 @@ class TestConv2dFusionOp(OpTest): } if self.add_residual_data: - residual_data = np.random.random(output.shape).astype(self.dtype) + residual_data = np.random.random(self.output.shape).astype( + self.dtype) self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype( residual_data) - output += residual_data + self.output += residual_data if self.add_bias: bias = np.random.random(self.filter_size[0]).astype(self.dtype) self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias) - output = output + bias.reshape((1, bias.size, 1, 1)) + self.output = self.output + bias.reshape((1, bias.size, 1, 1)) assert self.activation in ['relu', 'identity'] if self.activation == 'relu': - output = np.maximum(output, 0) + self.output = np.maximum(self.output, 0) self.attrs = { 'strides': self.stride, @@ -79,9 +83,12 @@ class TestConv2dFusionOp(OpTest): 'dilations': self.dilations, 'data_format': self.data_format, 'exhaustive_search': self.exhaustive_search, - 'activation': self.activation + 'activation': self.activation, + 'split_channels': self.channels } - self.outputs = {'Output': output} + self.outputs = {'Output': self.output} + + self.set_outputs() def testcuda(self): return core.is_compiled_with_cuda() @@ 
-117,6 +124,9 @@ class TestConv2dFusionOp(OpTest): def set_search_method(self): self.exhaustive_search = False + def set_outputs(self): + pass + class TestWithoutResidual(TestConv2dFusionOp): def init_bias_residual(self): @@ -160,5 +170,21 @@ class TestCUDNNExhaustiveSearch(TestConv2dFusionOp): self.exhaustive_search = True +class TestMultipleOutputs(TestConv2dFusionOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [1, 32, 17, 17] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [126, f_c, 3, 3] + self.channels = [84, 42] + + def set_outputs(self): + out1 = self.output[:, 0:84, :, :] + out2 = self.output[:, 84:126, :, :] + self.outputs['Outputs'] = [('out1', out1), ('out2', out2)] + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py new file mode 100644 index 0000000000..def188bfa6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py @@ -0,0 +1,270 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +from test_conv2d_op import conv2d_forward_naive, TestConv2dOp + + +def conv2d_forward_refer(input, filter, group, conv_param): + out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group, + conv_param) + out_tmp = np.zeros((in_n, out_h, out_w, out_c)) + for n in range(in_n): + for i in range(out_h): + for j in range(out_w): + for m in range(out_c): + out_tmp[n, i, j, m] = out[n, m, i, j] + return out_tmp.reshape(in_n, out_c, out_h, out_w) + + +class TestConv2dInt8Op(TestConv2dOp): + def setUp(self): + self.op_type = "conv2d" + self.use_cudnn = False + self.exhaustive_search = False + self.use_cuda = False + self.use_mkldnn = False + self.data_format = "AnyLayout" + self.weighttype = np.float32 + self.use_mkldnn = True + self.init_group() + self.init_dilation() + self.init_test_case() + self.init_fuse_relu() + self.init_data_type() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + + filter = np.random.random(self.filter_size).astype(self.weighttype) + if self.srctype == np.uint8: + input = np.random.randint(0, 10, + self.input_size).astype(self.srctype) + else: + input = np.random.randint(-5, 5, + self.input_size).astype(self.srctype) + input_shift = (np.ones(self.input_size) * 128).astype(np.uint8) + + if self.srctype == np.int8: + filter_int = np.round(filter * self.scale_weights[0] * + 0.5).astype(np.int32) + scale_output_shift = self.scale_out / (self.scale_in * + self.scale_weights[0] * 0.5) + output1 = conv2d_forward_refer( + np.round((input.astype(np.int32) + input_shift) * + self.scale_in).astype(np.int32), filter_int, + 
self.groups, + conv2d_param).astype(np.float32) * scale_output_shift + output2 = conv2d_forward_refer( + np.round((input_shift) * self.scale_in).astype(np.int32), + filter_int, self.groups, + conv2d_param).astype(np.float32) * scale_output_shift + if self.fuse_relu: + output = np.maximum(np.round(output1 - output2), + 0).astype(self.dsttype) + else: + output = np.round(output1 - output2).astype(self.dsttype) + else: + filter_int = np.round(filter * + self.scale_weights[0]).astype(np.int32) + scale_output_shift = self.scale_out / (self.scale_in * + self.scale_weights[0]) + output1 = conv2d_forward_refer( + input.astype(np.int32), filter_int, self.groups, + conv2d_param).astype(np.float32) + if self.fuse_relu: + output = np.maximum( + np.round(output1 * (self.scale_out / ( + self.scale_in * self.scale_weights[0]))), + 0).astype(self.dsttype) + else: + output = np.round(output1 * (self.scale_out / ( + self.scale_in * + self.scale_weights[0]))).astype(self.dsttype) + + self.inputs = { + 'Input': + OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + 'exhaustive_search': self.exhaustive_search, + 'Scale_in': self.scale_in, + 'Scale_out': self.scale_out, + 'Scale_weights': self.scale_weights, + 'fuse_relu': self.fuse_relu + } + self.outputs = {'Output': output} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace(), atol=0) + + def test_check_grad(self): + pass + + def test_check_grad_no_filter(self): + pass + + def test_check_grad_no_input(self): + pass + + def init_test_case(self): + TestConv2dOp.init_test_case(self) + f_c = self.input_size[1] // self.groups + self.filter_size = [1, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + def init_data_type(self): + self.srctype = np.uint8 + self.dsttype = np.int8 + + def init_fuse_relu(self): + self.fuse_relu = True + + +#--------------------test conv2d u8 in and u8 out-------------------- + + +class TestConv2d(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + +class TestWithPad(TestConv2d): + def init_test_case(self): + TestConv2d.init_test_case(self) + self.pad = [1, 1] + + +class TestWithGroup(TestConv2d): + def init_group(self): + self.groups = 3 + + +class TestWithStride(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.8 + self.scale_weights = [10.0] + + +class TestWith1x1(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [1, 3, 5, 5] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [12.0] + + +class TestWithInput1x1Filter1x1(TestConv2dInt8Op): + def 
init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 1, 1] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + def init_group(self): + self.groups = 3 + + +def init_data_type_with_fusion(self, input_dt, fuse_relu): + self.srctype = input_dt + self.dsttype = np.uint8 if fuse_relu else np.int8 + + def init_fuse_relu(self): + self.fuse_relu = fuse_relu + + +def create_test_int8_class(parent): + + #--------------------test conv2d s8 in and u8 out-------------------- + + class TestS8U8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.int8, True) + + #--------------------test conv2d s8 in and s8 out-------------------- + + class TestS8S8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.int8, False) + + #--------------------test conv2d u8 in and s8 out-------------------- + + class TestU8S8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.uint8, False) + + cls_name_s8u8 = "{0}_relu_{1}".format(parent.__name__, "1") + cls_name_s8s8 = "{0}_relu_{1}".format(parent.__name__, "0") + cls_name_u8s8 = "{0}_relu_{1}".format(parent.__name__, "0") + TestS8U8Case.__name__ = cls_name_s8u8 + TestS8S8Case.__name__ = cls_name_s8s8 + TestU8S8Case.__name__ = cls_name_u8s8 + globals()[cls_name_s8u8] = TestS8U8Case + globals()[cls_name_s8s8] = TestS8S8Case + globals()[cls_name_u8s8] = TestU8S8Case + + +create_test_int8_class(TestConv2dInt8Op) +create_test_int8_class(TestWithPad) +create_test_int8_class(TestWithStride) +create_test_int8_class(TestWithGroup) +create_test_int8_class(TestWith1x1) +create_test_int8_class(TestWithInput1x1Filter1x1) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index bcb79f232b..25a9e8d46e 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -60,7 +60,7 @@ def conv2d_forward_naive(input, filter, group, conv_param): np.sum(input_pad_masked * f_sub[k, :, :, :], axis=(1, 2, 3)) - return out + return out, in_n, out_h, out_w, out_c class TestConv2dOp(OpTest): @@ -85,8 +85,9 @@ class TestConv2dOp(OpTest): input = np.random.random(self.input_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype) - output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype(self.dtype) + output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups, + conv2d_param) + output = output.astype(self.dtype) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input), diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py deleted file mode 100644 index aa19a5edc7..0000000000 --- a/python/paddle/fluid/tests/unittests/test_data_balance.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import paddle.fluid as fluid -import paddle -import numpy as np - - -class TestDataBalance(unittest.TestCase): - def prepare_data(self): - def fake_data_generator(): - for n in range(self.total_ins_num): - yield np.ones((3, 4)) * n, n - - # Prepare data - with fluid.program_guard(fluid.Program(), fluid.Program()): - reader = paddle.batch( - fake_data_generator, batch_size=self.batch_size) - feeder = fluid.DataFeeder( - feed_list=[ - fluid.layers.data( - name='image', shape=[3, 4], dtype='float32'), - fluid.layers.data( - name='label', shape=[1], dtype='int64'), - ], - place=fluid.CPUPlace()) - self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file( - self.data_file_name, reader, feeder) - - def prepare_lod_data(self): - def fake_data_generator(): - for n in range(1, self.total_ins_num + 1): - d1 = (np.ones((n, 3)) * n).astype('float32') - d2 = (np.array(n).reshape((1, 1))).astype('int32') - yield d1, d2 - - # Prepare lod data - with fluid.program_guard(fluid.Program(), fluid.Program()): - with fluid.recordio_writer.create_recordio_writer( - filename=self.lod_data_file_name) as writer: - eof = False - generator = fake_data_generator() - while (not eof): - data_batch = [ - np.array([]).reshape((0, 3)), np.array([]).reshape( - (0, 1)) - ] - lod = [0] - for _ in range(self.batch_size): - try: - ins = next(generator) - except StopIteration: - eof = True - break - for i, d in enumerate(ins): - data_batch[i] = np.concatenate( - (data_batch[i], d), axis=0) - lod.append(lod[-1] + ins[0].shape[0]) - if data_batch[0].shape[0] > 0: - for i, d in enumerate(data_batch): - t = fluid.LoDTensor() - t.set(data_batch[i], fluid.CPUPlace()) - if i == 0: - t.set_lod([lod]) - writer.append_tensor(t) - writer.complete_append_tensor() - - def setUp(self): - self.use_cuda = fluid.core.is_compiled_with_cuda() - self.data_file_name = './data_balance_test.recordio' - self.lod_data_file_name = './data_balance_with_lod_test.recordio' - self.total_ins_num = 50 - self.batch_size = 12 - self.prepare_data() - self.prepare_lod_data() - - def main(self): - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - data_reader = fluid.layers.io.open_files( - filenames=[self.data_file_name], - shapes=[[-1, 3, 4], [-1, 1]], - lod_levels=[0, 0], - dtypes=['float32', 'int64']) - if self.use_cuda: - data_reader = fluid.layers.double_buffer(data_reader) - image, label = fluid.layers.read_file(data_reader) - - place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - - build_strategy = fluid.BuildStrategy() - build_strategy.enable_data_balance = True - parallel_exe = fluid.ParallelExecutor( - use_cuda=self.use_cuda, - main_program=main_prog, - build_strategy=build_strategy) - - if (parallel_exe.device_count > self.batch_size): - print("WARNING: Unittest TestDataBalance skipped. 
\ - For the result is not correct when device count \ - is larger than batch size.") - return - fetch_list = [image.name, label.name] - - data_appeared = [False] * self.total_ins_num - while (True): - try: - image_val, label_val = parallel_exe.run(fetch_list, - return_numpy=True) - except fluid.core.EOFException: - break - ins_num = image_val.shape[0] - broadcasted_label = np.ones( - (ins_num, 3, 4)) * label_val.reshape((ins_num, 1, 1)) - self.assertEqual(image_val.all(), broadcasted_label.all()) - for l in label_val: - self.assertFalse(data_appeared[l[0]]) - data_appeared[l[0]] = True - for i in data_appeared: - self.assertTrue(i) - - def main_lod(self): - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - data_reader = fluid.layers.io.open_files( - filenames=[self.lod_data_file_name], - shapes=[[-1, 3], [-1, 1]], - lod_levels=[1, 0], - dtypes=['float32', 'int32']) - ins, label = fluid.layers.read_file(data_reader) - - place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - build_strategy = fluid.BuildStrategy() - build_strategy.enable_data_balance = True - parallel_exe = fluid.ParallelExecutor( - use_cuda=self.use_cuda, - main_program=main_prog, - build_strategy=build_strategy) - - if parallel_exe.device_count > self.batch_size: - print("WARNING: Unittest TestDataBalance skipped. \ - For the result is not correct when device count \ - is larger than batch size.") - exit(0) - fetch_list = [ins.name, label.name] - - data_appeared = [False] * self.total_ins_num - while (True): - try: - ins_tensor, label_tensor = parallel_exe.run( - fetch_list, return_numpy=False) - except fluid.core.EOFException: - break - - ins_val = np.array(ins_tensor) - label_val = np.array(label_tensor) - ins_lod = ins_tensor.lod()[0] - self.assertEqual(ins_val.shape[1], 3) - self.assertEqual(label_val.shape[1], 1) - self.assertEqual(len(ins_lod) - 1, label_val.shape[0]) - for i in range(0, len(ins_lod) - 1): - ins_elem = ins_val[ins_lod[i]:ins_lod[i + 1]][:] - label_elem = label_val[i][0] - self.assertEqual(ins_elem.all(), label_elem.all()) - self.assertFalse(data_appeared[int(label_elem - 1)]) - data_appeared[int(label_elem - 1)] = True - - for i in data_appeared: - self.assertTrue(i) - - def test_all(self): - self.main() - self.main_lod() - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 07cc44aaa2..0caab08f0d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -442,10 +442,10 @@ class TestDistBase(unittest.TestCase): tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 0, w0_ep, self._lr / 2) + 0, w0_ep, self._lr) tr1_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 1, w1_ep, self._lr / 2) + 1, w1_ep, self._lr) if self._mem_opt: tr0_cmd += " --mem_opt" diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c2a4e5ca0c..28602d3251 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -15,6 +15,18 @@ from __future__ import print_function import unittest from 
test_dist_base import TestDistBase +import os + + +def skip_ci(func): + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + + def __func__(*args, **kwargs): + if on_ci: + return + return func(*args, **kwargs) + + return __func__ class TestDistSeResneXt2x2(TestDistBase): @@ -22,6 +34,7 @@ class TestDistSeResneXt2x2(TestDistBase): self._sync_mode = True self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) @@ -32,6 +45,7 @@ class TestDistseResnXt2x2WithMemopt(TestDistBase): self._mem_opt = True self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) @@ -41,6 +55,7 @@ class TestDistSeResneXt2x2Async(TestDistBase): self._sync_mode = False self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index d9ad4e2e2c..3d1ce6b27c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -14,14 +14,15 @@ from __future__ import print_function +import traceback import math +import collections +import six import unittest +import numpy as np + import paddle.fluid as fluid -from paddle.fluid.transpiler.distribute_transpiler import delete_ops -import traceback -import collections -import six class TranspilerTest(unittest.TestCase): @@ -520,7 +521,7 @@ class TestLocalLookupTable(TestDistLookupTableBase): 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv', - 'recv', 'recv', 'recv', 'fetch_barrier', 'concat', 'concat' + 'recv', 'fetch_barrier' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) @@ -560,7 +561,7 @@ class TestDistLookupTable(TestDistLookupTableBase): 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier', - 'recv', 'recv', 'recv', 'fetch_barrier', 'concat' + 'recv', 'recv', 'fetch_barrier' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) startup_ops = [ @@ -607,8 +608,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase): 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', - 'sum', 'split_selected_rows', 'send', 'recv', 'recv', 'recv', - 'recv', 'concat', 'concat' + 'sum', 'split_selected_rows', 'send', 'recv', 'recv' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) @@ -648,8 +648,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', - 'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv', - 'recv', 'concat' + 'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) startup_ops = [ @@ -824,5 +823,142 @@ class TestRemoteLookupTable(TestDistLookupTableBase): self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) +# test for remote prefetch +class 
TestRemoteNce(TestDistLookupTableBase): + def network_with_table(self, is_sparse, is_distributed): + + num_total_classes = 20 + sampler = "uniform" + nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32') + + input = fluid.layers.data(name="input", shape=[10], dtype="float32") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + w_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 10], + dtype='float32', + name='nce_w', + initializer=fluid.initializer.ConstantInitializer()) + b_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 1], + dtype='float32', + name='nce_b', + initializer=fluid.initializer.ConstantInitializer()) + + cost = fluid.layers.nce(input=input, + label=label, + num_total_classes=num_total_classes, + sampler=sampler, + custom_dist=nid_freq_arr.tolist(), + sample_weight=None, + param_attr='nce_w', + bias_attr='nce_b', + seed=1, + num_neg_samples=5, + is_sparse=is_sparse) + avg_cost = fluid.layers.mean(cost) + # optimizer + optimizer = fluid.optimizer.Adam(learning_rate=0.003) + optimizer.minimize(avg_cost) + + def net_conf(self): + import os + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + + out_vars = ["nce_w"] + in_vars = ["nce_b"] + + recv_var_names = [] + + for op in trainer.blocks[0].ops: + if op.type == "recv": + for var in op.output("Out"): + recv_var_names.append(var) + + for out_var in out_vars: + self.assertFalse(out_var in recv_var_names) + for in_var in in_vars: + self.assertTrue(in_var in recv_var_names) + + +# test for remote prefetch +class TestRemoteHsigmoid(TestDistLookupTableBase): + def network_with_table(self, is_sparse, is_distributed): + + num_total_classes = 3 + + input = fluid.layers.data(name="input", shape=[1], dtype="float32") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + path_table = fluid.layers.data( + name='path_table', shape=[3], dtype='int64') + path_code = fluid.layers.data( + name='path_code', shape=[3], dtype='int64') + w_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 10], + dtype='float32', + name='hs_w', + initializer=fluid.initializer.ConstantInitializer()) + b_param = fluid.default_main_program().global_block().create_parameter( + shape=[3, 1], + dtype='float32', + name='hs_b', + initializer=fluid.initializer.ConstantInitializer()) + + emb = fluid.layers.embedding( + input=input, + is_sparse=is_sparse, + size=[3, 3], + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(num_total_classes)))) + + cost = fluid.layers.hsigmoid( + input=emb, + label=label, + num_classes=num_total_classes, + path_table=path_table, + path_code=path_code, + is_custom=True, + is_sparse=is_sparse) + avg_cost = fluid.layers.mean(cost) + # optimizer + optimizer = fluid.optimizer.SGD(learning_rate=0.003) + optimizer.minimize(avg_cost) + + def net_conf(self): + import os + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + params_to_check = list() + for op in trainer.blocks[0].ops: + if op.type == "hierarchical_sigmoid": + params_to_check = [op.input("W")[0], op.input("Bias")[0]] + for name in ["epmap", "table_names", "epmap"]: + assert op.has_attr(name) + if 
name == "epmap": + assert op.attr(name)[0] == u'127.0.0.1:6174' + elif name == "table_names": + assert op.attr(name)[0] == u'hierarchical_sigmoid_0.w_0' + else: + assert op.attr(name) == 3 + elif op.type == "lookup_table": + params_to_check.append(op.input("W")[0]) + else: + pass + op_count = 0 + for op in trainer.blocks[0].ops: + if op.type == "recv": + assert len(op.output("Out")) == 1 + assert op.output("Out")[0] == u'hierarchical_sigmoid_0.b_0' + op_count += 1 + assert op_count == 1 + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py index 89476ee641..81b0b66781 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py @@ -29,6 +29,12 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): print('Skip use_cuda=True because Paddle is not compiled with cuda') return + if use_parallel_executor and os.name == 'nt': + print( + 'Skip use_parallel_executor=True because Paddle comes without parallel support on windows' + ) + return + word_dict = paddle.dataset.imdb.word_dict() train_reader = paddle.batch( paddle.dataset.imdb.train(word_dict), batch_size=batch_size) diff --git a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py new file mode 100644 index 0000000000..584e309bef --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py @@ -0,0 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.op import Operator +import paddle.compat as cpt + + +class TestFusedEmbeddingSeqPoolOp(OpTest): + def setUp(self): + self.op_type = "fused_embedding_seq_pool" + self.emb_size = 2 + table = np.random.random((17, self.emb_size)).astype("float32") + ids = np.array([[[4], [3]], [[4], [3]], [[2], [1]], + [[16], [1]]]).astype("int64") + merged_ids = np.array([4, 2, 16]).astype("int64") + ids_expand = np.expand_dims(ids, axis=1) + self.lod = [[3, 1]] + self.attrs = {'is_sparse': True} + self.inputs = {'W': table, 'Ids': (ids_expand, self.lod)} + self.outputs = { + 'Out': np.reshape( + np.array([ + table[[4, 3]] + table[[4, 3]] + table[[2, 1]], + table[[16, 1]] + ]), [len(self.lod[0]), 2 * self.emb_size]) + } + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 2a6c93f75f..8ed5074dc2 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -185,7 +185,7 @@ class TestHSigmoidOpSparse(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, 'Bias': bias @@ -287,7 +287,7 @@ class TestHSigmoidOpWithCostumTree(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, 'Bias': bias @@ -324,7 +324,7 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, } diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py new file mode 100644 index 0000000000..da343dd503 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py @@ -0,0 +1,269 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
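
The expected 'Out' built in TestFusedEmbeddingSeqPoolOp above packs a fair amount into one expression. The standalone numpy sketch below spells out the same computation as I read it (illustrative, not taken from the operator implementation): each instance looks up two ids, the two embedding rows are concatenated, and instances are then sum-pooled per LoD segment.

import numpy as np

emb_size = 2
table = np.random.random((17, emb_size)).astype("float32")
ids = np.array([[4, 3], [4, 3], [2, 1], [16, 1]])     # 4 instances, 2 ids each
lod = [3, 1]                                          # sequence lengths

per_instance = table[ids].reshape(len(ids), -1)       # (4, 2 * emb_size)
offsets = np.cumsum([0] + lod)                        # [0, 3, 4]
pooled = np.stack([per_instance[offsets[i]:offsets[i + 1]].sum(axis=0)
                   for i in range(len(lod))])         # (2, 2 * emb_size)
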
+ +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((5, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + 1 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_hsigmoid_op_one_pserver(self, place, port): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port)] + table_names = ['table'] + height_sections = [2] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i != 3: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), 0).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def _run_hsigmoid_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() 
+ label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] + table_names = ['table', 'table'] + height_sections = [2, 3] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i < 2: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), i + 9).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def test_hsigmoid_op_remote(self): + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + # run pserver on CPU in sync mode + p0 = self._start_pserver(0, False, True, run_pserver) + self._wait_ps_ready(p0.pid) + port0 = self._get_pserver_port(p0.pid) + + p1 = self._start_pserver(1, False, True, run_pserver) + self._wait_ps_ready(p1.pid) + port1 = self._get_pserver_port(p1.pid) + + places = [core.CPUPlace()] + + for place in places: + self._run_hsigmoid_op_one_pserver(place, port0) + self._run_hsigmoid_op_two_pserver(place, port0, port1) + + # raise SIGTERM to pserver + os.kill(p0.pid, signal.SIGINT) + p0.join() + os.kill(p1.pid, signal.SIGINT) + p1.join() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 0fe69d1bd4..1dc13ec74e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -18,17 +18,8 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.layers.nn import FC - - -@contextlib.contextmanager -def new_program_scope(): - prog = fluid.Program() - startup_prog = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(prog, startup_prog): - yield +from paddle.fluid.imperative.nn import FC +from test_imperative_base import new_program_scope class MyLayer(fluid.imperative.PyLayer): @@ -36,9 +27,11 @@ class MyLayer(fluid.imperative.PyLayer): super(MyLayer, self).__init__() def forward(self, inputs): - x = fluid.layers.relu(inputs[0]) + x = fluid.layers.relu(inputs) self._x_for_debug = x - return [fluid.layers.elementwise_mul(x, x)] + x = fluid.layers.elementwise_mul(x, x) + x = fluid.layers.reduce_sum(x) + return [x] class MLP(fluid.imperative.PyLayer): @@ -52,7 +45,7 @@ class MLP(fluid.imperative.PyLayer): initializer=fluid.initializer.Constant(value=0.1))) def forward(self, inputs): - x = self._fc1(inputs[0]) + x = self._fc1(inputs) x = self._fc2(x) x = fluid.layers.reduce_sum(x) return x @@ -64,13 +57,14 @@ class TestImperative(unittest.TestCase): cl = core.Layer() cl.forward([]) l = fluid.imperative.PyLayer() - l.forward([]) + self.assertRaises(NotImplementedError, 
l.forward, []) def test_layer_in_out(self): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.imperative.guard(): + var_inp = fluid.imperative.base.to_variable(np_inp) l = MyLayer() - x = l(np_inp)[0] + x = l(var_inp)[0] self.assertIsNotNone(x) dy_out = x._numpy() x._backward() @@ -95,8 +89,9 @@ class TestImperative(unittest.TestCase): def test_mlp(self): np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) with fluid.imperative.guard(): + var_inp = fluid.imperative.base.to_variable(np_inp) mlp = MLP() - out = mlp(np_inp) + out = mlp(var_inp) dy_out = out._numpy() out._backward() dy_grad = mlp._fc1._w._gradient() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_base.py b/python/paddle/fluid/tests/unittests/test_imperative_base.py new file mode 100644 index 0000000000..478cc13fb5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_base.py @@ -0,0 +1,30 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np + +import paddle.fluid as fluid +from paddle.fluid import core + + +@contextlib.contextmanager +def new_program_scope(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py new file mode 100644 index 0000000000..5d97edf876 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -0,0 +1,206 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
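
The test_imperative.py changes above switch the calling convention: an imperative layer is now fed a Variable produced by fluid.imperative.base.to_variable rather than a raw numpy array. A minimal usage sketch mirroring MyLayer, assuming this revision's fluid.imperative API:

import numpy as np
import paddle.fluid as fluid


class SquareSum(fluid.imperative.PyLayer):
    # Same shape as MyLayer above: relu, square, then reduce to a scalar.
    def forward(self, inputs):
        x = fluid.layers.relu(inputs)
        x = fluid.layers.elementwise_mul(x, x)
        return [fluid.layers.reduce_sum(x)]


np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
with fluid.imperative.guard():
    var_inp = fluid.imperative.base.to_variable(np_inp)  # wrap the numpy input
    out = SquareSum()(var_inp)[0]
    dy_out = out._numpy()                                # eager forward value
    out._backward()                                      # dygraph-style backward
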
+ +import contextlib +import unittest +import numpy as np +import six + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC +from paddle.fluid.imperative.base import to_variable +from test_imperative_base import new_program_scope + + +class SimpleImgConvPool(fluid.imperative.PyLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + pool_size, + pool_stride, + pool_padding=0, + pool_type='max', + global_pooling=False, + conv_stride=1, + conv_padding=0, + conv_dilation=1, + conv_groups=1, + act=None, + use_cudnn=False, + param_attr=None, + bias_attr=None): + super(SimpleImgConvPool, self).__init__() + + self._conv2d = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=conv_stride, + padding=conv_padding, + dilation=conv_dilation, + groups=conv_groups, + param_attr=None, + bias_attr=None, + use_cudnn=use_cudnn) + + self._pool2d = Pool2D( + pool_size=pool_size, + pool_type=pool_type, + pool_stride=pool_stride, + pool_padding=pool_padding, + global_pooling=global_pooling, + use_cudnn=use_cudnn) + + def forward(self, inputs): + x = self._conv2d(inputs) + x = self._pool2d(x) + return x + + +class MNIST(fluid.imperative.PyLayer): + def __init__(self, param_attr=None, bias_attr=None): + super(MNIST, self).__init__() + + self._simple_img_conv_pool_1 = SimpleImgConvPool( + 1, 20, 5, 2, 2, act="relu") + + self._simple_img_conv_pool_2 = SimpleImgConvPool( + 20, 50, 5, 2, 2, act="relu") + + pool_2_shape = 50 * 8 * 8 + SIZE = 10 + scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 + self._fc = FC(10, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + + def forward(self, inputs): + x = self._simple_img_conv_pool_1(inputs) + x = self._simple_img_conv_pool_2(x) + x = self._fc(x) + return x + + +class TestImperativeMnist(unittest.TestCase): + def test_mnist_cpu_float32(self): + seed = 90 + + with fluid.imperative.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + # mnist = Conv2D(1, 20, 5) + mnist = MNIST() + sgd = SGDOptimizer(learning_rate=1e-3) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128) + + dy_param_init_value = {} + for batch_id, data in enumerate(train_reader()): + if batch_id >= 2: + break + + x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + 128, 1) + + img = to_variable(x_data) + label = to_variable(y_data) + label._stop_gradient = True + + cost = mnist(img) + loss = fluid.layers.reduce_mean(cost) + dy_out = loss._numpy() + + if batch_id == 0: + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_init_value[param.name] = param._numpy() + + loss._backward() + sgd.minimize(loss) + dy_param_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_value[param.name] = param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + exe = fluid.Executor(fluid.CPUPlace()) + + # mnist = Conv2D(1, 20, 5) + mnist = MNIST() + sgd = SGDOptimizer(learning_rate=1e-3) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128) + + img = fluid.layers.data( + name='pixel', 
shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + cost = mnist(img) + loss = fluid.layers.reduce_mean(cost) + sgd.minimize(loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + for param in fluid.default_startup_program().global_block( + ).all_parameters(): + static_param_name_list.append(param.name) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_init_value[static_param_name_list[i]] = out[i] + + for batch_id, data in enumerate(train_reader()): + if batch_id >= 2: + break + + x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [128, 1]) + + fetch_list = [loss.name] + fetch_list.extend(static_param_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_out = out[0] + for i in range(1, len(out)): + static_param_value[static_param_name_list[i - 1]] = out[i] + + for key, value in six.iteritems(static_param_init_value): + self.assertTrue( + np.allclose(value.all(), dy_param_init_value[key].all())) + self.assertTrue(np.allclose(static_out.all(), dy_out.all())) + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value.all(), dy_param_value[key].all())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index e180822c2b..90f5d797a6 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -243,6 +243,10 @@ class TestBook(unittest.TestCase): pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True) self.assertIsNotNone(pool) self.assertIsNotNone(mask) + self.assertIsNotNone(layers.adaptive_pool2d(x, 3, pool_type='avg')) + pool, mask = layers.adaptive_pool2d(x, 3, require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) def test_adaptive_pool3d(self): program = Program() @@ -255,6 +259,10 @@ class TestBook(unittest.TestCase): x, [3, 3, 3], require_index=True) self.assertIsNotNone(pool) self.assertIsNotNone(mask) + self.assertIsNotNone(layers.adaptive_pool3d(x, 3, pool_type='avg')) + pool, mask = layers.adaptive_pool3d(x, 3, require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) def test_lstm_unit(self): program = Program() diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py new file mode 100644 index 0000000000..cc6f40de86 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -0,0 +1,236 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
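
The new assertions in test_layers above cover the fact that adaptive_pool2d and adaptive_pool3d accept the output size either as a list or as a single int. A small usage sketch; the input shape here is only a plausible placeholder, not taken from the test:

from paddle.fluid import layers
from paddle.fluid.framework import Program, program_guard

prog = Program()
with program_guard(prog, startup_program=Program()):
    # NCHW input variable (shape chosen for illustration only).
    x = layers.data(name='x', shape=[3, 32, 32], dtype='float32')
    avg_pool = layers.adaptive_pool2d(x, 3, pool_type='avg')            # int size
    max_pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True)
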
+ +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def nce(input, weight, bias, sample_weight, labels, num_classes, + num_sample_class): + samples = [] + sample_labels = [] + batch_size = input.shape[0] + num_true_class = labels.shape[1] + for i in range(batch_size): + w = 1 if sample_weight is None else sample_weight[i] + for label in labels[i]: + samples.append((i, label, True, w)) + sample_labels.append(label) + for num in range(num_sample_class): + samples.append((i, num, False, w)) + sample_labels.append(num) + # forward bias + sample_out = np.zeros(len(samples)).astype(np.float32) + if bias is not None: + for i in range(len(samples)): + sample_out[i] = bias[samples[i][1]] + # forward weight + for i in range(len(samples)): + sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]]) + + # forward activation + sample_out = 1.0 / (1.0 + np.exp(-sample_out)) + # forward cost + out = np.zeros(batch_size).astype(np.float32) + b = 1.0 / num_classes * num_sample_class + + for i in range(len(samples)): + o = sample_out[i] + cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b)) + out[samples[i][0]] += cost * samples[i][3] + return (out[:, np.newaxis], np.array(sample_out).reshape( + batch_size, num_sample_class + num_true_class), + np.array(sample_labels).reshape(batch_size, + num_sample_class + num_true_class)) + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((5, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + 1 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_nce_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('Input').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('Weight').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") + param.set(param_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + sample_w = scope.var('SampleWeight').get_tensor() + sample_weight = np.random.random((4, 1)).astype("float32") + sample_w.set(sample_weight, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([[0], [1], [4], [3]]) + label.set(label_array, place) + + cost = scope.var('Cost').get_tensor() + cost_w = np.zeros((4, 1)).astype("float32") + cost.set(cost_w, place) + + sample_l = scope.var('SampleLogits').get_tensor() + sample_l_w = np.zeros((4, 3)).astype("float32") + sample_l.set(sample_l_w, place) + + sample_la = scope.var('SampleLabels').get_tensor() + sample_la_w = np.zeros((4, 3)).astype("int") + sample_la.set(sample_la_w, place) + + emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] + table_names = ['table', 'table'] + height_sections = [2, 3] + + # create and run nce operator + nce_op = Operator( + "nce", + Input='Input', + Weight='Weight', + Label='Label', + Bias='Bias', + Cost='Cost', + SampleLogits='SampleLogits', + SampleLabels='SampleLabels', + SampleWeight='SampleWeight', + num_total_classes=5, + num_neg_samples=2, + custom_neg_classes=list(range(2)), + sampler=0, + seed=0, + is_sparse=True, + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + + nce_op.run(scope, place) + + # get and compare result + o_cost = np.array(scope.var('Cost').get_tensor()) + o_logits = np.array(scope.var('SampleLogits').get_tensor()) + o_labels = np.array(scope.var('SampleLabels').get_tensor()) + + param_array = np.ones((5, 8)).astype("float32") + for i in range(2): + param_array[i] *= param_array[i] * i + 0 * 10 + 1 + for i in range(2, 5): + param_array[i] *= param_array[i] * i + 1 * 10 + 1 + out = nce(x_array, param_array, bias_array, sample_weight, + label_array, 5, 2) + + self.assertAlmostEqual(o_cost.all(), out[0].all(), delta=1e-6) + self.assertAlmostEqual(o_logits.all(), out[1].all(), delta=1e-6) + self.assertAlmostEqual(o_labels.all(), out[2].all(), delta=1e-6) + + def test_nce_op_remote(self): + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + # run pserver on CPU in sync mode + p0 = self._start_pserver(0, False, True, run_pserver) + self._wait_ps_ready(p0.pid) + port0 = self._get_pserver_port(p0.pid) + + p1 = self._start_pserver(1, False, True, run_pserver) + self._wait_ps_ready(p1.pid) + port1 = self._get_pserver_port(p1.pid) + + places = [core.CPUPlace()] + + for place in places: + self._run_nce_op_two_pserver(place, port0, port1) + + # raise SIGTERM to pserver + os.kill(p0.pid, signal.SIGINT) + p0.join() + os.kill(p1.pid, signal.SIGINT) + p1.join() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py 
b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 37b9a9188a..4153394c1d 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase): set(mul_op.attr_names), set([ "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", - "op_namescope", "op_callstack" + "op_namescope" ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 84b0aad8ac..1c6cfce0c2 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -175,41 +175,61 @@ class TestCRFModel(unittest.TestCase): print(pe.run(feed=feeder.feed(cur_batch), fetch_list=[avg_cost.name])[0]) - def test_update_sparse_parameter_all_reduce(self): + def _new_build_strategy(self, use_reduce=False): build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + + if use_reduce: + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + else: + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + + return build_strategy + + def test_update_sparse_parameter_all_reduce(self): if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=True) + is_sparse=True, + build_strategy=self._new_build_strategy(), + use_cuda=True) + self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=False) + is_sparse=True, + build_strategy=self._new_build_strategy(), + use_cuda=False) def test_update_dense_parameter_all_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=True) + is_sparse=False, + build_strategy=self._new_build_strategy(), + use_cuda=True) + self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=False) + is_sparse=False, + build_strategy=self._new_build_strategy(), + use_cuda=False) def test_update_sparse_parameter_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=True) + is_sparse=True, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=True) self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=False) + is_sparse=True, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=False) def test_update_dense_parameter_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=True) + is_sparse=False, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=True) self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=False) + is_sparse=False, + 
build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 3eecc46701..9768f7db26 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -86,6 +86,7 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_reduce=False) + reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py new file mode 100644 index 0000000000..f4495d0bc8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pool2d_int8_mkldnn_op.py @@ -0,0 +1,110 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from __future__ import division + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +from test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive + + +class TestPool2dMKLDNNInt8_Op(TestPool2D_Op): + def init_kernel_type(self): + self.use_mkldnn = True + + def init_data_type(self): + self.dtype = np.int8 + + def setUp(self): + TestPool2D_Op.setUp(self) + assert self.dtype in [np.int8, np.uint8 + ], 'Dtype should be int8 or uint8' + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace(), atol=1e-5) + + def test_check_grad(self): + pass + + +class TestCase1Avg(TestPool2dMKLDNNInt8_Op): + def init_test_case(self): + self.shape = [2, 3, 7, 7] + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [0, 0] + + def init_global_pool(self): + self.global_pool = False + + +class TestCase2Avg(TestPool2dMKLDNNInt8_Op): + def init_test_case(self): + self.shape = [2, 3, 7, 7] + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [1, 1] + + def init_global_pool(self): + self.global_pool = False + + +class TestCase0Max(TestPool2dMKLDNNInt8_Op): + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + +class TestCase1Max(TestCase1Avg): + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + +class TestCase2Max(TestCase2Avg): + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + +def create_test_s8_u8_class(parent): + class TestS8Case(parent): + def init_data_type(self): + self.dtype = np.int8 + + class TestU8Case(parent): + def init_data_type(self): + self.dtype = np.uint8 + + cls_name_s8 = "{0}_{1}".format(parent.__name__, "mkldnn_s8") + cls_name_u8 = "{0}_{1}".format(parent.__name__, "mkldnn_u8") + TestS8Case.__name__ = 
cls_name_s8 + TestU8Case.__name__ = cls_name_u8 + globals()[cls_name_s8] = TestS8Case + globals()[cls_name_u8] = TestU8Case + + +create_test_s8_u8_class(TestPool2dMKLDNNInt8_Op) +create_test_s8_u8_class(TestCase1Avg) +create_test_s8_u8_class(TestCase2Avg) +create_test_s8_u8_class(TestCase0Max) +create_test_s8_u8_class(TestCase1Max) +create_test_s8_u8_class(TestCase2Max) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py index 19f29c7826..7de5fefc14 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py @@ -18,35 +18,22 @@ import unittest from test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5 -class TestMKLDNNCase1(TestPool2D_Op): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase2(TestCase1): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase3(TestCase2): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase4(TestCase3): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase5(TestCase4): - def init_kernel_type(self): - self.use_mkldnn = True - - -class TestMKLDNNCase6(TestCase5): - def init_kernel_type(self): - self.use_mkldnn = True - +def create_test_mkldnn_class(parent): + class TestMKLDNNCase(parent): + def init_kernel_type(self): + self.use_mkldnn = True + + cls_name = "{0}_{1}".format(parent.__name__, "MKLDNNOp") + TestMKLDNNCase.__name__ = cls_name + globals()[cls_name] = TestMKLDNNCase + + +create_test_mkldnn_class(TestPool2D_Op) +create_test_mkldnn_class(TestCase1) +create_test_mkldnn_class(TestCase2) +create_test_mkldnn_class(TestCase3) +create_test_mkldnn_class(TestCase4) +create_test_mkldnn_class(TestCase5) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 5ccdf082e8..92515add59 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -115,7 +115,7 @@ class TestPool2D_Op(OpTest): self.op_type = "pool2d" self.use_cudnn = False self.use_mkldnn = False - self.dtype = np.float32 + self.init_data_type() self.init_test_case() self.init_global_pool() self.init_kernel_type() @@ -177,6 +177,9 @@ class TestPool2D_Op(OpTest): def init_kernel_type(self): pass + def init_data_type(self): + self.dtype = np.float32 + def init_pool_type(self): self.pool_type = "avg" self.pool2D_forward_naive = avg_pool2D_forward_naive diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 943ad3ed22..655378f7f8 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -26,7 +26,7 @@ os.environ['CPU_NUM'] = str(dev_cnt) def dummy_func_with_no_input(): - return float(1.0) + return np.array([0], dtype='float32') def dummy_func_with_no_output(x): @@ -105,7 +105,7 @@ def simple_fc_net(img, label, use_py_func_op): name='test_tmp_var', dtype='float32', shape=[1]) fluid.layers.py_func( func=dummy_func_with_no_input, x=None, out=dummy_var) - + loss += dummy_var fluid.layers.py_func(func=dummy_func_with_no_output, x=loss, out=None) loss = fluid.layers.mean(loss) @@ -174,7 +174,7 @@ class 
TestPyFuncOpUseExecutor(unittest.TestCase): self.assertAlmostEqual(max_diff, 0, delta=1e-3) -class TestPyFuncOpUseParallelExecutor(unittest.TestCase): +class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor): def setUp(self): self.use_parallel_executor = True diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py index d94494e219..559386545e 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -209,6 +209,7 @@ class TestPyReaderUsingExecutor(unittest.TestCase): else: thread = threading.Thread( target=feed_data, args=(feed_queue, reader)) + thread.daemon = True thread.start() self.outputs = [] @@ -219,6 +220,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase): feed_queue.close() self.validate() + if not use_decorate_paddle_reader: + thread.join() def validate(self): self.assertEqual(len(self.inputs), len(self.outputs)) diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py index e97a05b6f9..7eeffa1039 100644 --- a/python/paddle/fluid/tests/unittests/test_reader_reset.py +++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py @@ -75,8 +75,6 @@ class TestReaderReset(unittest.TestCase): exe.run(startup_prog) build_strategy = fluid.BuildStrategy() - if with_double_buffer: - build_strategy.enable_data_balance = True exec_strategy = fluid.ExecutionStrategy() parallel_exe = fluid.ParallelExecutor( use_cuda=self.use_cuda, diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py new file mode 100644 index 0000000000..f37d2bfb2e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py @@ -0,0 +1,188 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
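+# Descriptive note on this new test (added for readability): it trains a BOW
+# model on a few imdb batches with Adagrad, applies weight decay manually by
+# scaling every trainable parameter by the learning rate and subtracting that
+# value back from the parameter (elementwise_sub + assign after
+# optimizer.minimize), and then checks that the losses produced by a plain
+# Executor and by ParallelExecutor (under both the AllReduce and Reduce
+# strategies) agree within a 5e-5 relative tolerance.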
+ +from __future__ import print_function +import contextlib + +import unittest +from functools import partial +import numpy as np +import paddle +import paddle.fluid.core as core + +import paddle.fluid as fluid + + +def get_places(): + places = [] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + +@contextlib.contextmanager +def prog_scope_guard(main_prog, startup_prog): + scope = fluid.core.Scope() + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + with fluid.program_guard(main_prog, startup_prog): + yield + + +def bow_net(data, + label, + dict_dim, + is_sparse=False, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestWeightDecay(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), batch_size=4)() + self.train_data = [next(reader) for _ in range(5)] + self.learning_rate = .5 + + def run_executor(self, place, feed_list, loss): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + main_prog = fluid.default_main_program() + loss_set = [] + for data in self.train_data: + out = exe.run(main_prog, + feed=feeder.feed(data), + fetch_list=[loss.name]) + + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def run_parallel_exe(self, + place, + feed_list, + loss, + use_cuda=True, + use_reduce=False, + use_fast_executor=False, + use_ir_memory_optimize=False): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + + exec_strategy = fluid.ExecutionStrategy() + if use_fast_executor: + exec_strategy.use_experimental_executor = True + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.memory_optimize = use_ir_memory_optimize + + parallel_exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + + loss_set = [] + for data in self.train_data: + out = parallel_exe.run(feed=feeder.feed(data), + fetch_list=[loss.name]) + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def check_weight_decay(self, + place, + model, + use_parallel_exe=False, + use_reduce=False): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog): + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost = 
model(data, label, len(self.word_dict)) + + param_list = [(var, var * self.learning_rate) + for var in main_prog.block(0).all_parameters()] + + optimizer = fluid.optimizer.Adagrad( + learning_rate=self.learning_rate) + + optimizer.minimize(avg_cost) + + for params in param_list: + updated_p = fluid.layers.elementwise_sub( + x=params[0], y=params[1]) + fluid.layers.assign(input=updated_p, output=params[0]) + + if use_parallel_exe: + loss = self.run_parallel_exe( + place, [data, label], + loss=avg_cost, + use_cuda=True, + use_reduce=use_reduce) + else: + loss = self.run_executor(place, [data, label], loss=avg_cost) + + return loss + + def test_weight_decay(self): + model = partial(bow_net, is_sparse=False) + for place in get_places(): + loss = self.check_weight_decay(place, model, use_parallel_exe=False) + + loss2 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=False) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss2[i], rtol=5e-5) + + loss3 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=True) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss3[i], rtol=5e-5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index dc3b2cb8bc..c4eb26893c 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -137,9 +137,9 @@ def append_input_output(block, op_proto, np_list, is_input, dtype): var_dict = {} for var_proto in proto_list: var_name = str(var_proto.name) + if (var_name not in np_list) and var_proto.dispensable: + continue if is_input: - if (var_name not in np_list) and var_proto.dispensable: - continue assert (var_name in np_list) or (var_proto.dispensable), \ "Missing {} as input".format(var_name) if var_proto.duplicable: diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index f223d86554..8d11db376d 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -125,14 +125,23 @@ def slice_variable(var_list, slice_count, min_block_size): class DistributeTranspilerConfig(object): """ - Args: - slice_var_up (bool): Do Tensor slice for pservers, default is True. - split_method (PSDispatcher): RoundRobin or HashName can be used - try to choose the best method to balance loads for pservers. - min_block_size (int): Minimum splitted element number in block. - According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 + .. py:attribute:: slice_var_up (bool) + + Do Tensor slice for pservers, default is True. + + .. py:attribute:: split_method (PSDispatcher) + + RoundRobin or HashName can be used. + Try to choose the best method to balance loads for pservers. + + .. py:attribute:: min_block_size (int) + + Minimum number of splitted elements in block. + + According to : https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 We can use bandwidth effiently when data size is larger than 2MB.If you - want to change it, please be sure you see the slice_variable function. + want to change it, please be sure you have read the slice_variable function. 
+ """ slice_var_up = True @@ -242,11 +251,10 @@ class DistributeTranspiler(object): def _get_all_remote_sparse_update_op(self, main_program): sparse_update_ops = [] - sparse_update_op_types = ["lookup_table"] + sparse_update_op_types = ["lookup_table", "nce", "hierarchical_sigmoid"] for op in main_program.global_block().ops: if op.type in sparse_update_op_types and op.attr( - 'remote_prefetch') is True and not op.attr( - 'is_distributed'): + 'remote_prefetch') is True: sparse_update_ops.append(op) return sparse_update_ops diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index ccf7af334d..cc7f5ec90c 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -57,7 +57,7 @@ class InferenceTranspiler(object): raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() - if not isinstance(scope, core.Scope): + if not isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) diff --git a/tools/check_doc_approval.py b/tools/check_doc_approval.py new file mode 100644 index 0000000000..44fdf58b49 --- /dev/null +++ b/tools/check_doc_approval.py @@ -0,0 +1,85 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import ast +import hashlib +import importlib +import paddle.fluid + +files = [ + "paddle.fluid", "paddle.fluid.average", "paddle.fluid.backward", + "paddle.fluid.clip", "paddle.fluid.data_feeder", "paddle.fluid.executor", + "paddle.fluid.initializer", "paddle.fluid.io", "paddle.fluid.layers", + "paddle.fluid.metrics", "paddle.fluid.nets", "paddle.fluid.optimizer", + "paddle.fluid.profiler", "paddle.fluid.recordio_writer", + "paddle.fluid.regularizer", "paddle.fluid.transpiler" +] + + +def md5(doc): + hash = hashlib.md5() + hash.update(str(doc)) + return hash.hexdigest() + + +def get_module(): + for fi in files: + fi_lib = importlib.import_module(fi) + doc_function = getattr(fi_lib, "__all__") + for api in doc_function: + api_name = fi + "." 
+ api + try: + doc_module = getattr(eval(api_name), "__doc__") + except: + pass + doc_md5_code = md5(doc_module) + doc_dict[api_name] = doc_md5_code + + +def doc_md5_dict(doc_md5_path): + with open(doc_md5_path, "rb") as f: + doc_md5 = f.read() + doc_md5_dict = ast.literal_eval(doc_md5) + return doc_md5_dict + + +def check_doc_md5(): + for k, v in doc_dict.items(): + try: + if doc_ci_dict[k] != v: + return doc_dict + except: + return doc_dict + return True + + +if __name__ == "__main__": + doc_dict = {} + doc_ci_dict = {} + doc_md5_file = "/root/.cache/doc_md5.txt" + if not os.path.exists(doc_md5_file): + os.mknod(doc_md5_file) + else: + doc_ci_dict = doc_md5_dict(doc_md5_file) + get_module() + if not os.path.getsize(doc_md5_file): + with open(doc_md5_file, 'w') as f: + f.write(str(doc_dict)) + check_dic = True + print(check_dic) + else: + check_dic = check_doc_md5() + print(check_dic)
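For context, the new tools/check_doc_approval.py script fingerprints each public API's docstring with md5 and compares the digests against a baseline cached in /root/.cache/doc_md5.txt, printing True when nothing changed and the fresh digest map otherwise. Below is a minimal, self-contained sketch of that idea, not part of the patch: the doc_fingerprint helper and the sample API name are illustrative only. The sketch encodes the docstring to bytes before hashing, since hash.update(str(...)) as written in the patch works on Python 2 but would raise a TypeError on Python 3.

    import hashlib

    def doc_fingerprint(docstring):
        # Hash the docstring text; encoding to UTF-8 keeps this working on
        # Python 3, where hashlib only accepts bytes.
        return hashlib.md5(str(docstring).encode("utf-8")).hexdigest()

    # Compare the current digest against a previously cached one to decide
    # whether the public docs changed and therefore need review/approval.
    cached = {"paddle.fluid.example_api": "0" * 32}  # placeholder baseline
    current = doc_fingerprint("Example docstring for paddle.fluid.example_api")
    needs_approval = cached.get("paddle.fluid.example_api") != current
    print(needs_approval)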