diff --git a/CMakeLists.txt b/CMakeLists.txt index a51552d96a..9ec632e206 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,9 +33,7 @@ if(WIN32) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") endif(WIN32) -if(NOT CMAKE_CROSSCOMPILING) - find_package(CUDA QUIET) -endif(NOT CMAKE_CROSSCOMPILING) +find_package(CUDA QUIET) find_package(Git REQUIRED) find_package(Threads REQUIRED) @@ -49,7 +47,6 @@ option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FO option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) -option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) @@ -60,11 +57,9 @@ option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) -option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) -option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_PSLIB "Compile with pslib support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) @@ -96,37 +91,6 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() -if(ANDROID OR IOS) - if(ANDROID) - if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") - message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") - endif() - endif() - - set(WITH_GPU OFF CACHE STRING - "Disable GPU when cross-compiling for Android and iOS" FORCE) - set(WITH_AVX OFF CACHE STRING - "Disable AVX when cross-compiling for Android and iOS" FORCE) - set(WITH_PYTHON OFF CACHE STRING - "Disable PYTHON when cross-compiling for Android and iOS" FORCE) - set(WITH_RDMA OFF CACHE STRING - "Disable RDMA when cross-compiling for Android and iOS" FORCE) - set(WITH_MKL OFF CACHE STRING - "Disable MKL when cross-compiling for Android and iOS" FORCE) - set(WITH_NGRAPH OFF CACHE STRING - "Disable nGraph when cross-compiling for Android and iOS" FORCE) - set(WITH_GOLANG OFF CACHE STRING - "Disable golang when cross-compiling for Android and iOS" FORCE) - - # Compile PaddlePaddle mobile inference library - if (NOT WITH_C_API) - set(WITH_C_API ON CACHE STRING - "Always compile the C_API when cross-compiling for Android and iOS" FORCE) - endif() - set(MOBILE_INFERENCE ON) - add_definitions(-DPADDLE_MOBILE_INFERENCE) -endif() - if (APPLE) set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) @@ -135,8 +99,6 @@ endif() if (WIN32) set(WITH_DISTRIBUTE OFF CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) - set(WITH_C_API OFF CACHE STRING - "Disable C_API when compiling for Windows" FORCE) set(WITH_FLUID_ONLY ON CACHE STRING "Enable FLUID_ONLY when compiling for Windows" FORCE) endif() @@ -150,21 +112,7 @@ set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" 
CACHE STRING "A path setting fluid inference shared and static libraries") -if (WITH_C_API AND WITH_PYTHON) - message(WARNING "It is suggest not embedded a python interpreter in Paddle " - "when using C-API. It will give an unpredictable behavior when using a " - "different Python interpreter from compiling.") -endif() - -if (WITH_C_API) - set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE) -endif() - -if(MOBILE_INFERENCE) - set(THIRD_PARTY_BUILD_TYPE MinSizeRel) -else() - set(THIRD_PARTY_BUILD_TYPE Release) -endif() +set(THIRD_PARTY_BUILD_TYPE Release) set(WITH_MKLML ${WITH_MKL}) if (NOT DEFINED WITH_MKLDNN) @@ -193,7 +141,6 @@ include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas include(external/mkldnn) # download, build, install mkldnn include(external/ngraph) # download, build, install nGraph -include(external/swig) # download, build, install swig include(external/boost) # download boost include(external/any) # download libn::any include(external/eigen) # download eigen3 @@ -312,11 +259,6 @@ if(WITH_MKLDNN) list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) endif() -if(USE_NNPACK) - include(external/nnpack) - list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS}) -endif(USE_NNPACK) - set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/Dockerfile.android b/Dockerfile.android deleted file mode 100644 index 48db2efea2..0000000000 --- a/Dockerfile.android +++ /dev/null @@ -1,42 +0,0 @@ -FROM ubuntu:16.04 -MAINTAINER PaddlePaddle Authors - -ARG UBUNTU_MIRROR -RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' - -# ENV variables -ARG ANDROID_ABI -ARG ANDROID_API - -ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"} -ENV ANDROID_API=${ANDROID_API:-21} - -ENV HOME=/root \ - ANDROID_NDK_HOME=/opt/android-ndk-linux \ - ANDROID_TOOLCHAINS_DIR=/opt/toolchains - -RUN apt-get update && \ - apt-get install -y \ - git python-dev python-pip python-numpy \ - wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \ - apt-get clean -y - -# git credential to skip password typing -RUN git config --global credential.helper store - -# Fix locales to en_US.UTF-8 -RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -RUN pip install --upgrade pip==9.0.3 && \ - pip install -U 'protobuf==3.1.0' && \ - pip install -U wheel sphinx && \ - pip install pre-commit - -# Android NDK -RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \ - mkdir -p /opt/android-ndk-tmp && \ - cd /opt/android-ndk-tmp && \ - wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ - unzip -q android-ndk-r14b-linux-x86_64.zip && \ - mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ - rm -rf /opt/android-ndk-tmp diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 24de8d9d7c..52ac31d1d1 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -64,24 +64,18 @@ endif() ## Then find the reference-cblas. 
www.netlib.org/blas/ set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH "Folder contains reference-cblas") -if(NOT CMAKE_CROSSCOMPILING) - set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/include - /usr/include - /usr/include/cblas - ) +set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/include + /usr/include + /usr/include/cblas +) - set(REFERENCE_CBLAS_LIB_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/reference/ - /usr/lib/reference/ - ) -else() - # Disable the finding of reference cblas under host's system path - set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include) - set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib) -endif() +set(REFERENCE_CBLAS_LIB_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/lib + /usr/lib + /usr/lib/blas/reference/ + /usr/lib/reference/ +) if(WITH_SYSTEM_BLAS) find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS @@ -98,10 +92,3 @@ if(WITH_SYSTEM_BLAS) message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") endif() endif() - -if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER vecLib) - set(CBLAS_INC_DIR ${VECLIB_INC_DIR}) - add_definitions(-DPADDLE_USE_VECLIB) -endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index e3d856fb30..076e839120 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -49,12 +49,10 @@ if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) endif(NOT WITH_PROFILER) -if(NOT CMAKE_CROSSCOMPILING) - if(WITH_AVX AND AVX_FOUND) - set(SIMD_FLAG ${AVX_FLAG}) - elseif(SSE3_FOUND) - set(SIMD_FLAG ${SSE3_FLAG}) - endif() +if(WITH_AVX AND AVX_FOUND) + set(SIMD_FLAG ${AVX_FLAG}) +elseif(SSE3_FOUND) + set(SIMD_FLAG ${SSE3_FLAG}) endif() if(WIN32) diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake deleted file mode 100644 index 4cf2be3bdf..0000000000 --- a/cmake/cross_compiling/android.cmake +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This is a toolchain file for cross-compiling for Android, and the -# configuration refers to the open-source resposity: -# https://github.com/taka-no-me/android-cmake -# Most of the variables are compatible with that used in -# https://developer.android.com/ndk/guides/cmake.html -# The supported variables are listed belows: -# -# ANDROID_STANDALONE_TOOLCHAIN -# ANDROID_TOOLCHAIN -# ANDROID_ABI -# ANDROID_NATIVE_API_LEVEL -# ANDROID_ARM_MODE -# ANDROID_ARM_NEON -# -# For CMake >= 3.7.0, all the settings will be delivered to CMake system -# variables to let CMake do the cross-compiling configurations itself. 
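Since this whole toolchain file is being deleted, anyone still cross-building for Android would rely on exactly that built-in support. A minimal sketch of an equivalent user-supplied toolchain file for CMake >= 3.7 (the API level and NDK path here are assumptions for illustration, not values taken from this patch):

    # hypothetical stand-in for the deleted cross_compiling/android.cmake
    set(CMAKE_SYSTEM_NAME Android)
    set(CMAKE_SYSTEM_VERSION 21)                   # target API level (assumed)
    set(CMAKE_ANDROID_ARCH_ABI armeabi-v7a)
    set(CMAKE_ANDROID_NDK /opt/android-ndk-linux)  # assumed NDK install path
    set(CMAKE_ANDROID_STL_TYPE c++_static)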
-# More detail of cross-compiling settings -# https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html - -IF(NOT ANDROID) - return() -ENDIF() - -# check the exist of android standalone toolchain -IF(NOT DEFINED ANDROID_STANDALONE_TOOLCHAIN) - SET(ANDROID_STANDALONE_TOOLCHAIN $ENV{ANDROID_STANDALONE_TOOLCHAIN} - CACHE PATH "Folder holds the standalone toolchain of Android NDK") -ENDIF() -IF(NOT ANDROID_STANDALONE_TOOLCHAIN) - MESSAGE(WARNING "It is recommended to set ANDROID_STANDALONE_TOOLCHAIN to " - "use a standalone toolchain.\n" - "To cross-compile for Android, you need to:\n" - "1. Download an Android NDK from" - " https://developer.android.com/ndk/downloads/index.html\n" - "2. Setup a standalone toolchain" - "https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn\n") -ENDIF() - -IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL) - IF(ANDROID_NATIVE_API_LEVEL MATCHES "^android-[0-9]+$") - STRING(REPLACE "android-" "" CMAKE_SYSTEM_VERSION "${CMAKE_MATCH_0}") - ELSEIF(ANDROID_NATIVE_API_LEVEL MATCHES "^[0-9]+$") - SET(CMAKE_SYSTEM_VERSION ${ANDROID_NATIVE_API_LEVEL}) - ENDIF() -ENDIF() - -IF(NOT DEFINED ANDROID_TOOLCHAIN) - SET(ANDROID_TOOLCHAIN clang) -ENDIF() - -IF(NOT DEFINED ANDROID_ABI) - SET(ANDROID_ABI "armeabi-v7a") -ENDIF() - -IF(NOT DEFINED ANDROID_ARM_MODE) - SET(ANDROID_ARM_MODE ON) -ENDIF() -IF(ANDROID_ARM_MODE) - SET(ANDROID_ARM_MODE_NAME "arm") -ELSE(ANDROID_ARM_MODE) - SET(ANDROID_ARM_MODE_NAME "thumb") -ENDIF(ANDROID_ARM_MODE) - -IF(NOT DEFINED ANDROID_ARM_NEON) - SET(ANDROID_ARM_NEON ON) -ENDIF() - -IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") - IF("${CMAKE_VERSION}" VERSION_LESS "3.1.0") - SET(CMAKE_SYSTEM_NAME "Linux") - ENDIF() - MESSAGE(WARNING "It is recommended to use CMake >= 3.7.0 (current version: " - "${CMAKE_VERSION}), when cross-compiling for Android.") - - IF(ANDROID_STANDALONE_TOOLCHAIN) - # Use standalone toolchain - SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot") - - IF(NOT CMAKE_SYSTEM_VERSION) - SET(ANDROID_STANDALONE_TOOLCHAIN_API "") - SET(ANDROID_API_LEVEL_H_REGEX "^[\t ]*#[\t ]*define[\t ]+__ANDROID_API__[\t ]+([0-9]+)") - FILE(STRINGS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h" - ANDROID_API_LEVEL_H_CONTENT REGEX "${ANDROID_API_LEVEL_H_REGEX}") - IF(ANDROID_API_LEVEL_H_CONTENT MATCHES "${ANDROID_API_LEVEL_H_REGEX}") - SET(ANDROID_STANDALONE_TOOLCHAIN_API "${CMAKE_MATCH_1}") - ENDIF() - SET(CMAKE_SYSTEM_VERSION ${ANDROID_STANDALONE_TOOLCHAIN_API}) - ENDIF() - - # Toolchain - SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN}) - ELSE(ANDROID_NDK) - # TODO: use android ndk - ENDIF() - - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) - IF(ANDROID_ABI STREQUAL "armeabi") - SET(CMAKE_SYSTEM_PROCESSOR armv5te) - SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi) - ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") - SET(CMAKE_SYSTEM_PROCESSOR armv7-a) - SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi) - ENDIF() - ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") - SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) - SET(CMAKE_SYSTEM_PROCESSOR aarch64) - SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android) - ELSE() - MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.") - ENDIF() - SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") - - IF(ANDROID_TOOLCHAIN STREQUAL clang) - SET(ANDROID_C_COMPILER_NAME clang) - SET(ANDROID_CXX_COMPILER_NAME clang++) - 
SET(CMAKE_C_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) - SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE}) - ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc) - SET(ANDROID_C_COMPILER_NAME gcc) - SET(ANDROID_CXX_COMPILER_NAME g++) - ELSE() - MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}") - ENDIF() - - # C compiler - IF(NOT CMAKE_C_COMPILER) - SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}") - ELSE() - GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) - ENDIF() - IF(NOT EXISTS ${ANDROID_C_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find C compiler: ${ANDROID_C_COMPILER}") - ENDIF() - - # CXX compiler - IF(NOT CMAKE_CXX_COMPILER) - SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}") - ELSE() - GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) - ENDIF() - IF(NOT EXISTS ${ANDROID_CXX_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${ANDROID_CXX_COMPILER}") - ENDIF() - - SET(CMAKE_C_COMPILER ${ANDROID_C_COMPILER} CACHE PATH "C compiler" FORCE) - SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) - - # Toolchain and ABI specific flags. - SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections") - SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections") - - IF(ANDROID_ABI STREQUAL "armeabi") - LIST(APPEND ANDROID_COMPILER_FLAGS - -march=armv5te - -mtune=xscale - -msoft-float) - ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") - LIST(APPEND ANDROID_COMPILER_FLAGS - -march=armv7-a - -mfloat-abi=softfp) - IF(ANDROID_ARM_NEON) - LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=neon) - ELSE() - LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16) - ENDIF() - LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8) - ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") - LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) - ENDIF() - - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - IF(ANDROID_ARM_MODE) - LIST(APPEND ANDROID_COMPILER_FLAGS -marm) - ELSE() - LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb) - ENDIF() - IF(ANDROID_TOOLCHAIN STREQUAL clang) - # Disable integrated-as for better compatibility. - LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as) - ENDIF() - ENDIF() - - IF(ANDROID_TOOLCHAIN STREQUAL clang) - # CMake automatically forwards all compiler flags to the linker, - # and clang doesn't like having -Wa flags being used for linking. - # To prevent CMake from doing this would require meddling with - # the CMAKE__COMPILE_OBJECT rules, which would get quite messy. 
- LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments) - ENDIF() - - STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") - STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}") - - SET(CMAKE_C_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" - CACHE STRING "C flags") - SET(CMAKE_CXX_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}" - CACHE STRING "CXX flags") - SET(CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}" - CACHE STRING "shared linker flags") - - SET(CMAKE_POSITION_INDEPENDENT_CODE TRUE) - SET(CMAKE_EXE_LINKER_FLAGS "-pie -fPIE ${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}" - CACHE STRING "executable linker flags") - - MESSAGE(STATUS "Android: Targeting API '${CMAKE_SYSTEM_VERSION}' " - "with architecture '${ANDROID_ARM_MODE_NAME}', " - "ABI '${ANDROID_ABI}', and processor '${CMAKE_SYSTEM_PROCESSOR}'") - MESSAGE(STATUS "System CMAKE_C_FLAGS: " ${CMAKE_C_FLAGS}) - MESSAGE(STATUS "System CMAKE_CXX_FLAGS: " ${CMAKE_CXX_FLAGS}) -ELSE() - IF(ANDROID_STANDALONE_TOOLCHAIN) - SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN}) - ENDIF() - SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI}) - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE}) - IF(ANDROID_ABI STREQUAL "armeabi-v7a") - SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON}) - ENDIF() - ENDIF() -ENDIF() diff --git a/cmake/cross_compiling/host.cmake b/cmake/cross_compiling/host.cmake deleted file mode 100644 index f9c6b12136..0000000000 --- a/cmake/cross_compiling/host.cmake +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# find host C compiler -IF(HOST_C_COMPILER) - SET(HOST_C_COMPILER_NAME ${HOST_C_COMPILER}) -ELSEIF(NOT $ENV{CC} STREQUAL "") - SET(HOST_C_COMPILER_NAME $ENV{CC}) -ELSE() - SET(HOST_C_COMPILER_NAME cc) -ENDIF() - -GET_FILENAME_COMPONENT(HOST_C_COMPILER_PATH ${HOST_C_COMPILER_NAME} PROGRAM) -IF(NOT HOST_C_COMPILER_PATH OR NOT EXISTS ${HOST_C_COMPILER_PATH}) - MESSAGE(FATAL_ERROR "Cannot find host C compiler, set host C compiler:\n" - "\tcmake .. -DHOST_C_COMPILER=...") -ENDIF() - -# find host CXX compiler -IF(HOST_CXX_COMPILER) - SET(HOST_CXX_COMPILER_NAME ${HOST_CXX_COMPILER}) -ELSEIF(NOT $ENV{CXX} STREQUAL "") - SET(HOST_CXX_COMPILER_NAME $ENV{CXX}) -ELSE() - SET(HOST_CXX_COMPILER_NAME c++) -ENDIF() - -GET_FILENAME_COMPONENT(HOST_CXX_COMPILER_PATH ${HOST_CXX_COMPILER_NAME} PROGRAM) -IF(NOT HOST_CXX_COMPILER_PATH OR NOT EXISTS ${HOST_CXX_COMPILER_PATH}) - MESSAGE(FATAL_ERROR "Cannot find host CXX compiler, set host CXX compiler:\n" - "\tcmake .. 
-DHOST_CXX_COMPILER=...") -ENDIF() - -SET(HOST_C_COMPILER ${HOST_C_COMPILER_PATH} CACHE PATH "Host C compiler") -SET(HOST_CXX_COMPILER ${HOST_CXX_COMPILER_PATH} CACHE PATH "Host CXX compiler") - -MESSAGE(STATUS "Found host C compiler: " ${HOST_C_COMPILER}) -MESSAGE(STATUS "Found host CXX compiler: " ${HOST_CXX_COMPILER}) diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake deleted file mode 100644 index 10d389ec8e..0000000000 --- a/cmake/cross_compiling/ios.cmake +++ /dev/null @@ -1,347 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This is a toolchain file for cross-compiling for iOS, and the -# configuration largely refers to public toolchain file: -# https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake -# and -# https://github.com/cristeab/ios-cmake -# -# Supports options: -# IOS_PLATFORM = OS (default) or SIMULATOR -# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders -# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch. -# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch. -# IOS_ARCH -# The archectures wanted to support, such "arm64", "armv7;arm64" -# IOS_DEPLOYMENT_TARGET -# The minimum iOS deployment version, such as "7.0" -# IOS_ENABLE_BITCODE = ON (default) or OFF -# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON -# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder -# By default this location is automatcially chosen based on the IOS_PLATFORM value above. -# If set manually, it will override the default location and force the user of a particular Developer Platform -# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder -# By default this location is automatcially chosen based on the IOS_DEVELOPER_ROOT value. -# In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path. -# If set manually, this will force the use of a specific SDK version - -# Macros: -# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE) -# A convenience macro for setting xcode specific properties on targets -# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1") -# find_host_package (PROGRAM ARGS) -# A macro used to find executable programs on the host system, not within the iOS environment. -# Thanks to the android-cmake project for providing the command - -if(NOT IOS) - return() -endif() - -set(CMAKE_SYSTEM_NAME Darwin) - -# Get the Xcode version being used. 
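The deleted block that follows leans on a detail of execute_process() that is easy to miss: RESULT_VARIABLE receives the process exit status, so 0 (success) makes if(NOT <result>) take the success branch. The same pattern as a self-contained sketch (variable names are illustrative):

    execute_process(COMMAND xcodebuild -version
                    OUTPUT_VARIABLE _xcode_out
                    RESULT_VARIABLE _xcode_ret
                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
    if(NOT _xcode_ret)  # exit status 0: xcodebuild ran successfully
      message(STATUS "xcodebuild reported: ${_xcode_out}")
    else()
      message(FATAL_ERROR "Cannot run xcodebuild; is Xcode installed?")
    endif()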
-execute_process(COMMAND xcodebuild -version - OUTPUT_VARIABLE XCODE_VERSION - RESULT_VARIABLE XCODE_VERSION_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) -if(NOT ${XCODE_VERSION_RESULT}) - string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") - string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") - message(STATUS "Building with Xcode version: ${XCODE_VERSION}") -else() - message(FATAL_ERROR "Cannot execute xcodebuild, please check whether xcode is installed.") -endif() - -# Required as of cmake 2.8.10 -set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE) - -# Setup iOS platform unless specified manually with IOS_PLATFORM -if(NOT DEFINED IOS_PLATFORM) - set(IOS_PLATFORM "OS") -endif() -set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") - -# Set the architecture for iOS -if(NOT DEFINED IOS_ARCH) - if(IOS_PLATFORM STREQUAL "OS") - set(IOS_ARCH "armv7;armv7s;arm64") - elseif(IOS_PLATFORM STREQUAL "SIMULATOR") - set(IOS_ARCH "i386;x86_64") - endif() -endif() -set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") - -# Specify minimum iOS deployment version -if(NOT DEFINED IOS_DEPLOYMENT_TARGET) - set(IOS_DEPLOYMENT_TARGET "7.0") -endif() -set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version") - -# Whether to enable bitcode -if(NOT DEFINED IOS_ENABLE_BITCODE) - set(IOS_ENABLE_BITCODE ON) -endif() -set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode") - -if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS) - set(IOS_USE_VECLIB_FOR_BLAS OFF) -endif() -set(IOS_USE_VECLIB_FOR_BLAS ${IOS_UES_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib") - -# Check the platform selection and setup for developer root -if(${IOS_PLATFORM} STREQUAL "OS") - set(IOS_PLATFORM_LOCATION "iPhoneOS.platform") - set(XCODE_IOS_PLATFORM iphoneos) - - # This causes the installers to properly locate the output libraries - set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos") -elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR") - set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform") - set(XCODE_IOS_PLATFORM iphonesimulator) - - # This causes the installers to properly locate the output libraries - set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator") -elseif(${IOS_PLATFORM} STREQUAL "WATCHOS") - set(IOS_PLATFORM_LOCATION "WatchOS.platform") - set(XCODE_IOS_PLATFORM watchos) - - # This causes the installers to properly locate the output libraries - set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos") -else(${IOS_PLATFORM} STREQUAL "OS") - message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. 
Please set to\n" - "\t OS, SIMULATOR, or WATCHOS.") -endif() - -# Check iOS developer toolchain -if(NOT DEFINED IOS_DEVELOPER_ROOT) - # Setup iOS developer location - execute_process(COMMAND xcode-select -print-path - OUTPUT_VARIABLE XCODE_DEVELOPER_DIR - RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - # Xcode 4.3 changed the installation location, choose the most recent one available - if(${XCODE_VERSION} VERSION_LESS "4.3.0") - set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer") - else() - set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer") - endif() -endif() -if(EXISTS ${IOS_DEVELOPER_ROOT}) - set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform") -else() - message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.") -endif() - -# Check iOS SDK -if(NOT DEFINED IOS_SDK_ROOT) - # Find and use the most recent iOS sdk - file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*") - if(IOS_SDK_LISTS) - list(SORT IOS_SDK_LISTS) - list(REVERSE IOS_SDK_LISTS) - list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT) - else(IOS_SDK_LISTS) - message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}." - " Please manually set IOS_SDK_ROOT or install the iOS SDK.") - endif(IOS_SDK_LISTS) -endif() -if(EXISTS ${IOS_SDK_ROOT}) - set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK") - message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}") -else() - message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.") -endif() - -# Set the sysroot default to the most recent SDK -set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support") - -# Get version of iOS SDK -execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion - OUTPUT_VARIABLE IOS_SDK_VERSION - RESULT_VARIABLE IOS_SDK_VERSION_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) -if(${IOS_SDK_VERSION_RESULT}) - string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}") -endif() -if(NOT IOS_SDK_VERSION) - message(WARNING "Cannot get SDK's version.") - set(IOS_SDK_VERSION 1) -endif() -set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION}) - -# Find the C & C++ compilers for the specified SDK. 
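The compiler discovery removed next resolves clang through xcrun against the selected sysroot rather than trusting whatever is on PATH, which is what kept device and simulator builds pointed at the matching SDK. The core of the pattern, reduced to a sketch (the output variable name is illustrative):

    # ask the selected SDK for its own clang instead of searching PATH
    execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang
                    OUTPUT_VARIABLE _sdk_clang
                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)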
-if(NOT CMAKE_C_COMPILER) - # Default to use clang - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang - OUTPUT_VARIABLE IOS_C_COMPILER - RESULT_VARIABLE IOS_C_COMPILER_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if(${IOS_C_COMPILER_RESULT}) - get_filename_component(IOS_C_COMPILER clang PROGRAM) - endif() -else(NOT CMAKE_C_COMPILER) - # User can set it in cmake command - get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) -endif(NOT CMAKE_C_COMPILER) -if(NOT EXISTS ${IOS_C_COMPILER}) - message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}") -endif() - -if(NOT CMAKE_CXX_COMPILER) - # Default to use clang++ - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ - OUTPUT_VARIABLE IOS_CXX_COMPILER - RESULT_VARIABLE IOS_CXX_COMPILER_RESULT - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - if(${IOS_CXX_COMPILER_RESULT}) - get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM) - endif() -else(NOT CMAKE_CXX_COMPILER) - # User can set it in cmake command - get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) -endif(NOT CMAKE_CXX_COMPILER) -if(NOT EXISTS ${IOS_CXX_COMPILER}) - message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}") -endif() - -set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE) -set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) - -set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") -set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") -set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") -set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") - -# Set iOS specific C/C++ flags -if(IOS_PLATFORM STREQUAL "OS") - if(XCODE_VERSION VERSION_LESS "7.0") - set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}") - else() - # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM. 
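Concretely, XCODE_IOS_PLATFORM is "iphoneos" for device builds and "iphonesimulator" for the simulator, so the set() just below expands roughly as follows (deployment target value assumed for illustration):

    set(XCODE_IOS_PLATFORM_VERSION_FLAGS
        "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}")
    # device builds:    -miphoneos-version-min=7.0
    # simulator builds: -miphonesimulator-version-min=7.0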
- set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") - endif() -else() - set(XCODE_IOS_FLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}") -endif() - -if(IOS_ENABLE_BITCODE) - set(XCODE_IOS_BITCODE_FLAGS "${IOS_COMPILER_FLAGS} -fembed-bitcode") -else() - set(XCODE_IOS_BITCODE_FLAGS "") -endif() - -set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}") - -# Hidden visibilty is required for cxx on iOS -set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") -set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") - -set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first") - -if(IOS_USE_VECLIB_FOR_BLAS) - # Find vecLib for iOS - set(VECLIB_SEARCH_DIRS - ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks - ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks - ) - find_path(VECLIB_INC_DIR vecLib.h PATHS ${VECLIB_SEARCH_DIRS}/vecLib.framework/Headers) - - include(FindPackageHandleStandardArgs) - find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR) - - if(VECLIB_FOUND) - if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*") - set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework vecLib") - message(STATUS "Found standalone vecLib.framework") - else() - set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework Accelerate") - message(STATUS "Found vecLib as part of Accelerate.framework") - endif() - - endif() -endif() - -set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}") -set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}") - -set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) -if(NOT IOS_ENABLE_BITCODE) - set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names") - set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names") -else() - set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib") - set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle") -endif() -set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") -set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") -set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a") - -# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree -# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache -# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun) -# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex -if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) - find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) -endif() - -# Set the find root to the iOS developer roots and to user defined paths -set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} - CACHE string "iOS find search path root") - -# default to searching for frameworks first -set(CMAKE_FIND_FRAMEWORK FIRST) - -# set up the default search directories for frameworks -set(CMAKE_SYSTEM_FRAMEWORK_PATH - ${IOS_SDK_ROOT}/System/Library/Frameworks - ${IOS_SDK_ROOT}/System/Library/PrivateFrameworks - ${IOS_SDK_ROOT}/Developer/Library/Frameworks - ) - -# only search the iOS sdks, not the remainder of the host filesystem -set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) - -message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', " - "building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'") -message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}") -message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") - -# Used in ExternalProject command -string(REPLACE ";" "\\$" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}") -set(EXTERNAL_OPTIONAL_ARGS - -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT} - -DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES}) - -# This little macro lets you set any XCode specific property -macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE) - set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE}) -endmacro(set_xcode_property) - -# This macro lets you find executable programs on the host system -macro(find_host_package) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) - set(IOS FALSE) - - find_package(${ARGN}) - - set(IOS TRUE) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -endmacro(find_host_package) diff --git a/cmake/cross_compiling/raspberry_pi.cmake b/cmake/cross_compiling/raspberry_pi.cmake deleted file mode 100644 index 0425b2ae15..0000000000 --- a/cmake/cross_compiling/raspberry_pi.cmake +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This is a toolchain file for cross-compiling for Raspberry Pi. -# The supported variables are listed belows: -# -# RPI_TOOLCHAIN -# RPI_ARM_NEON -# -# Also you can set CMAKE_C/CXX_COMPILER yourself, through cmake arguments. 
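With cross_compiling/raspberry_pi.cmake gone, a Raspberry Pi cross build would go through an ordinary user-supplied CMake toolchain file instead. A minimal sketch, assuming the gcc-linaro-arm-linux-gnueabihf-raspbian compilers from raspberrypi/tools are on PATH (the compiler names are assumptions):

    # hypothetical stand-in for the deleted file
    set(CMAKE_SYSTEM_NAME Linux)
    set(CMAKE_SYSTEM_PROCESSOR arm)
    set(CMAKE_C_COMPILER   arm-linux-gnueabihf-gcc)
    set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++)
    set(CMAKE_C_FLAGS "-mfpu=neon ${CMAKE_C_FLAGS}")  # NEON, as RPI_ARM_NEON did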
- -IF(NOT RPI) - return() -ENDIF() - -SET(CMAKE_SYSTEM_NAME Linux) -SET(CMAKE_SYSTEM_VERSION 1) -SET(CMAKE_SYSTEM_PROCESSOR arm) - -# check the exist of raspberry pi toolchain -IF(NOT DEFINED RPI_TOOLCHAIN) - SET(RPI_TOOLCHAIN $ENV{RPI_TOOLCHAIN} - CACHE PATH "Folder holds the toolchain of Raspberr Pi") -ENDIF() -IF(NOT RPI_TOOLCHAIN) - MESSAGE(WARNING "It is recommended to set RPI_TOOLCHAIN to use toolchain.\n" - "To cross-compile for Raspberry Pi, you need to download the tools using:\n" - " git clone https://github.com/raspberrypi/tools\n") -ENDIF() - -IF(NOT DEFINED RPI_ARM_NEON) - SET(RPI_ARM_NEON ON) -ENDIF() - -IF(RPI_TOOLCHAIN) - SET(RPI_TOOLCHAIN_ROOT ${RPI_TOOLCHAIN}) - IF(RPI_TOOLCHAIN_ROOT MATCHES "gcc-linaro-arm-linux-gnueabihf-raspbian(-x64)?$") - # gcc-linaro-arm-linux-gnueabihf-raspbian - # gcc-linaro-arm-linux-gnueabihf-raspbian-x64 - SET(RPI_TOOLCHAIN_NAME arm-linux-gnueabihf) - ENDIF() - SET(RPI_TOOLCHAIN_PREFIX "${RPI_TOOLCHAIN_ROOT}/bin/${RPI_TOOLCHAIN_NAME}-") -ENDIF() - -# C compiler -IF(NOT CMAKE_C_COMPILER) - SET(RPI_C_COMPILER "${RPI_TOOLCHAIN_PREFIX}gcc") -ELSE() - GET_FILENAME_COMPONENT(RPI_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) -ENDIF() -IF(NOT EXISTS ${RPI_C_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find C compiler: ${RPI_C_COMPILER}") -ENDIF() - -# CXX compiler -IF(NOT CMAKE_CXX_COMPILER) - SET(RPI_CXX_COMPILER "${RPI_TOOLCHAIN_PREFIX}g++") -ELSE() - GET_FILENAME_COMPONENT(RPI_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) -ENDIF() -IF(NOT EXISTS ${RPI_CXX_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${RPI_CXX_COMPILER}") -ENDIF() - -SET(CMAKE_C_COMPILER ${RPI_C_COMPILER} CACHE PATH "C compiler" FORCE) -SET(CMAKE_CXX_COMPILER ${RPI_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) - -IF(RPI_ARM_NEON) - SET(RPI_C_FLAGS "${RPI_C_FLAGS} -mfpu=neon") -ENDIF() - -SET(CMAKE_C_FLAGS "${RPI_C_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") -SET(CMAKE_CXX_FLAGS "${RPI_C_FLAGS} ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 16432ce2b8..ea46f6418e 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -63,9 +63,7 @@ function(select_nvcc_arch_flags out_variable) # List of arch names set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") set(archs_name_default "All") - if(NOT CMAKE_CROSSCOMPILING) - list(APPEND archs_names "Auto") - endif() + list(APPEND archs_names "Auto") # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake index a743b572a6..52507a6ae4 100644 --- a/cmake/external/cares.cmake +++ b/cmake/external/cares.cmake @@ -13,7 +13,7 @@ # limitations under the License. 
# -IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) +IF(NOT WITH_DISTRIBUTE) return() ENDIF() diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 95ca16f57f..f3ca74faea 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -71,13 +71,3 @@ if (WIN32) set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib) endif(HAVE_SHLWAPI) endif (WIN32) - -IF(WITH_C_API) - INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags) - IF(ANDROID) - INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib) - ENDIF() -ENDIF() - diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 8cd0455c16..7a6a452388 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -26,14 +26,8 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) -IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") - # Using the unofficial glog for Android API < 21 - SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git") - SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8") -ELSE() - SET(GLOG_REPOSITORY "https://github.com/google/glog.git") - SET(GLOG_TAG "v0.3.5") -ENDIF() +SET(GLOG_REPOSITORY "https://github.com/google/glog.git") +SET(GLOG_TAG "v0.3.5") ExternalProject_Add( extern_glog @@ -78,12 +72,3 @@ ADD_DEPENDENCIES(glog extern_glog gflags) LINK_LIBRARIES(glog gflags) LIST(APPEND external_project_dependencies glog) - -IF(WITH_C_API) - INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog) - IF(ANDROID) - INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib) - ENDIF() -ENDIF() diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index fd9835d023..c5754da59b 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -13,7 +13,7 @@ # limitations under the License. # -IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) +IF(NOT WITH_DISTRIBUTE) return() ENDIF() diff --git a/cmake/external/gzstream.cmake b/cmake/external/gzstream.cmake index 3e36ef7ae2..af7a8bfda6 100644 --- a/cmake/external/gzstream.cmake +++ b/cmake/external/gzstream.cmake @@ -13,10 +13,6 @@ # limitations under the License. # -IF(MOBILE_INFERENCE) - return() -ENDIF() - include (ExternalProject) # NOTE: gzstream is needed when linking with ctr reader. 
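The cares.cmake, grpc.cmake, and gzstream.cmake hunks above, and several external-dependency hunks below, all apply one mechanical simplification: with MOBILE_INFERENCE gone, compound early-return guards lose their mobile term, and guards that existed only for mobile builds are deleted outright. In the compound case:

    # before
    IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
      return()
    ENDIF()
    # after
    IF(NOT WITH_DISTRIBUTE)
      return()
    ENDIF()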
diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake index 530f7ebe28..39f49d210a 100644 --- a/cmake/external/libxsmm.cmake +++ b/cmake/external/libxsmm.cmake @@ -19,8 +19,8 @@ IF(NOT WITH_LIBXSMM) return() ENDIF() -IF(WIN32 OR APPLE OR ANDROID OR IOS) - MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.") +IF(WIN32 OR APPLE) + MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet.") SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE) return() ENDIF() diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 03f0dee859..6a7be73f09 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -110,7 +110,3 @@ else(WIN32) endif(WIN32) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn) -IF(WITH_C_API) - INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) -ENDIF() - diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 43322a257a..2caff27357 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -74,7 +74,3 @@ ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) LIST(APPEND external_project_dependencies mklml) - -IF(WITH_C_API) - INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib) -ENDIF() diff --git a/cmake/external/nnpack.cmake b/cmake/external/nnpack.cmake deleted file mode 100644 index d42bcb0f32..0000000000 --- a/cmake/external/nnpack.cmake +++ /dev/null @@ -1,30 +0,0 @@ -# Find the NNPACK library -# NNPACK_ROOT - where to find NNPACK include and library. -# - -set(NNPACK_FOUND OFF) -set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK") -find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include) -find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib) -find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib) -find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib) -find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib) - -if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB) - set(NNPACK_FOUND ON) - INCLUDE_DIRECTORIES(${NNPACK_INC_DIR}) - - set(NNPACK_LIBS) - list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB}) - if (NNPACK_UKERNELS_LIB) - list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB}) - endif() - if (NNPACK_CPUFEATURES_LIB) - list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB}) - endif() - if(NOT ANDROID) - list(APPEND NNPACK_LIBS "rt") - endif() -else() - message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})") -endif() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index aeb976b840..b347a59292 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -40,38 +40,12 @@ IF(NOT ${CBLAS_FOUND}) SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") SET(OPENBLAS_COMMIT "v0.2.20") - IF(CMAKE_CROSSCOMPILING) - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) - GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY) - SET(CROSS_SUFFIX ${CROSS_SUFFIX}/) - IF(ANDROID) - IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") - # use softfp - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0) - ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) - ENDIF() - ELSEIF(IOS) - 
IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64") - SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") - SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) - ELSE() - MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. " - "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.") - ENDIF() - ELSEIF(RPI) - # use hardfp - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0) - ENDIF() - ELSE() - IF(APPLE) - SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") - ENDIF() - SET(OPTIONAL_ARGS "") - IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") - SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) - ENDIF() + IF(APPLE) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") + ENDIF() + SET(OPTIONAL_ARGS "") + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") + SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) ENDIF() SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) @@ -92,25 +66,6 @@ IF(NOT ${CBLAS_FOUND}) ELSE() ENDIF(NOT WIN32) SET(CBLAS_PROVIDER openblas) - IF(WITH_C_API) - INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas) - # Because libopenblas.a is a symbolic link of another library, thus need to - # install the whole directory. - IF(ANDROID) - SET(TMP_INSTALL_DIR third_party/openblas/lib/${ANDROID_ABI}) - ELSE() - SET(TMP_INSTALL_DIR third_party/openblas/lib) - ENDIF() - INSTALL(CODE "execute_process( - COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib - ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR} - )" - ) - INSTALL(CODE "MESSAGE(STATUS \"Installing: \" - \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\" - )" - ) - ENDIF() ENDIF(NOT ${CBLAS_FOUND}) MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index e1e619e572..e05b7694dd 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -204,15 +204,6 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") - IF(MOBILE_INFERENCE) - # The reason why the official version is not used is described in - # https://github.com/PaddlePaddle/Paddle/issues/6114 - SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git") - SET(PROTOBUF_TAG "v3.2.0") - IF(NOT BUILD_FOR_HOST) - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF") - ENDIF() - ENDIF() ExternalProject_Add( ${TARGET_NAME} @@ -240,19 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -IF(NOT MOBILE_INFERENCE) - SET(PROTOBUF_VERSION 3.1) -ELSE() - SET(PROTOBUF_VERSION 3.2) -ENDIF() -IF(CMAKE_CROSSCOMPILING) - build_protobuf(protobuf_host TRUE) - LIST(APPEND external_project_dependencies protobuf_host) - - SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_host_PROTOC_EXECUTABLE} - CACHE FILEPATH "protobuf executable." FORCE) -ENDIF() - +SET(PROTOBUF_VERSION 3.1) IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) @@ -266,20 +245,7 @@ IF(NOT PROTOBUF_FOUND) SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." 
FORCE) - IF(WITH_C_API) - INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf) - IF(ANDROID) - INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib) - ENDIF() - ENDIF() - - IF(CMAKE_CROSSCOMPILING) - PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf) - ELSE() - SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} - CACHE FILEPATH "protobuf executable." FORCE) - PROMPT_PROTOBUF_LIB(extern_protobuf) - ENDIF() + SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} + CACHE FILEPATH "protobuf executable." FORCE) + PROMPT_PROTOBUF_LIB(extern_protobuf) ENDIF(NOT PROTOBUF_FOUND) diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index 3b495d78e2..b4ea268e5a 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -71,7 +71,3 @@ ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) LIST(APPEND external_project_dependencies pslib) - -IF(WITH_C_API) - INSTALL(FILES ${PSLIB_LIB} ${PSLIB_IOMP_LIB} DESTINATION lib) -ENDIF() diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake index 7ff5a8aca1..8b43f2ef5c 100644 --- a/cmake/external/pslib_brpc.cmake +++ b/cmake/external/pslib_brpc.cmake @@ -71,7 +71,3 @@ ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT}) LIST(APPEND external_project_dependencies pslib_brpc) - -IF(WITH_C_API) - INSTALL(FILES ${PSLIB_BRPC_LIB} ${PSLIB_BRPC_IOMP_LIB} DESTINATION lib) -ENDIF() diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index f9d4cd9740..27d075336d 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -if(MOBILE_INFERENCE OR RPI) - return() -endif() - include (ExternalProject) # NOTE: snappy is needed when linking with recordio diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake index 1ec79462c1..392f186b7c 100644 --- a/cmake/external/snappystream.cmake +++ b/cmake/external/snappystream.cmake @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -IF(MOBILE_INFERENCE OR RPI) - return() -ENDIF() - include (ExternalProject) set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake deleted file mode 100644 index de07703695..0000000000 --- a/cmake/external/swig.cmake +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -IF(NOT WITH_SWIG_PY) - return() -ENDIF() - -FIND_PACKAGE(SWIG) - -IF(NOT SWIG_FOUND) - # build swig as an external project - INCLUDE(ExternalProject) - - SET(SWIG_SOURCES_DIR ${THIRD_PARTY_PATH}/swig) - SET(SWIG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/swig) - SET(SWIG_TARGET_VERSION "3.0.2") - SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41") - SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f") - - IF(WIN32) - # swig.exe available as pre-built binary on Windows: - ExternalProject_Add(swig - URL http://prdownloads.sourceforge.net/swig/swigwin-${SWIG_TARGET_VERSION}.zip - URL_MD5 ${SWIG_DOWNLOAD_WIN_MD5} - SOURCE_DIR ${SWIG_SOURCES_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_COMMAND "" - ) - SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE) - SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe CACHE FILEPATH "SWIG Executable" FORCE) - ELSE(WIN32) - # swig uses bison find it by cmake and pass it down - FIND_PACKAGE(BISON) - - # From SWIG configure - ExternalProject_Add(swig - GIT_REPOSITORY https://github.com/swig/swig.git - GIT_TAG rel-3.0.10 - PREFIX ${SWIG_SOURCES_DIR} - CONFIGURE_COMMAND cd && ./autogen.sh && ./configure - --prefix=${SWIG_INSTALL_DIR} --without-pcre - BUILD_COMMAND cd && make - INSTALL_COMMAND cd && make install - UPDATE_COMMAND "" - ) - - SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_TARGET_VERSION}) - SET(SWIG_EXECUTABLE ${SWIG_INSTALL_DIR}/bin/swig) - ENDIF(WIN32) - - LIST(APPEND external_project_dependencies swig) -ENDIF(NOT SWIG_FOUND) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 7b937c93fe..7a25aaf15f 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -IF(MOBILE_INFERENCE) - return() -ENDIF() - INCLUDE(ExternalProject) SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index c3e1212d8f..a0f300c2e8 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -73,12 +73,3 @@ include_directories(${XXHASH_INCLUDE_DIR}) add_dependencies(xxhash extern_xxhash) LIST(APPEND external_project_dependencies xxhash) - -IF(WITH_C_API) - INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash) - IF(ANDROID) - INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib) - ENDIF() -ENDIF() diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index d350737537..6c8d79c25e 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -59,12 +59,3 @@ SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) ADD_DEPENDENCIES(zlib extern_zlib) LIST(APPEND external_project_dependencies zlib) - -IF(WITH_C_API) - INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib) - IF(ANDROID) - INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib/${ANDROID_ABI}) - ELSE() - INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib) - ENDIF() -ENDIF() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index c4472040ce..9e6c47f016 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -156,10 +156,8 @@ set(GPU_COMMON_FLAGS endif(NOT WIN32) if (APPLE) - if(NOT CMAKE_CROSSCOMPILING) - # On Mac OS X build fat binaries with x86_64 architectures by default. 
- set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) - endif() + # On Mac OS X build fat binaries with x86_64 architectures by default. + set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0 set (COMMON_FLAGS -Wno-deprecated-register) endif(APPLE) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 3f1be11d85..6679a09dfc 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -90,11 +90,11 @@ # including binary directory for generated headers. include_directories(${CMAKE_CURRENT_BINARY_DIR}) -if(NOT APPLE AND NOT ANDROID) +if(NOT APPLE) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") -endif(NOT APPLE AND NOT ANDROID) +endif(NOT APPLE) set_property(GLOBAL PROPERTY FLUID_MODULES "") # find all fluid modules is used for paddle fluid static library @@ -388,6 +388,7 @@ function(cc_test TARGET_NAME) endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) # No unit test should exceed 10 minutes. set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) @@ -460,6 +461,7 @@ function(nv_test TARGET_NAME) endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endfunction(nv_test) @@ -655,12 +657,6 @@ function(paddle_protobuf_generate_cpp SRCS HDRS) set(${SRCS}) set(${HDRS}) - if (MOBILE_INFERENCE) - set(EXTRA_FLAG "lite:") - else() - set(EXTRA_FLAG "") - endif() - foreach(FIL ${ARGN}) get_filename_component(ABS_FIL ${FIL} ABSOLUTE) get_filename_component(FIL_WE ${FIL} NAME_WE) @@ -677,7 +673,7 @@ function(paddle_protobuf_generate_cpp SRCS HDRS) COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} -I${CMAKE_CURRENT_SOURCE_DIR} - --cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} + --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} DEPENDS ${ABS_FIL} protoc COMMENT "Running C++ protocol buffer compiler on ${FIL}" VERBATIM ) @@ -714,9 +710,10 @@ function(py_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true - FLAGS_cpu_deterministic=true + FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 3e11d332ff..a7dce4dfdb 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -149,25 +149,23 @@ if (WITH_NGRAPH) ) endif () -if 
(NOT MOBILE_INFERENCE AND NOT RPI) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") - copy(snappy_lib - SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappy) +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") +copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappy) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") - copy(snappystream_lib - SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappystream) +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") +copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappystream) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") - copy(zlib_lib - SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS zlib) -endif () +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") +copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS zlib) # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") diff --git a/cmake/system.cmake b/cmake/system.cmake index c91ef91127..65db05bebe 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -74,21 +74,6 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") -# configuration for cross-compiling -IF(DEFINED CMAKE_SYSTEM_NAME) - INCLUDE(cross_compiling/host) - IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android") - SET(ANDROID TRUE) - INCLUDE(cross_compiling/android) - ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi") - SET(RPI TRUE) - INCLUDE(cross_compiling/raspberry_pi) - ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - SET(IOS TRUE) - INCLUDE(cross_compiling/ios) - ENDIF() -ENDIF() - # external dependencies log output SET(EXTERNAL_PROJECT_LOG_ARGS LOG_DOWNLOAD 0 # Wrap download in script to log output diff --git a/cmake/util.cmake b/cmake/util.cmake index 0dc33ce385..02667dbce6 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -53,118 +53,3 @@ function(target_circle_link_libraries TARGET_NAME) "-Wl,--end-group") endif() endfunction() - -# compile_cu_as_cpp -# Make a cu file compiled as C++ -# Arguments: Source files -macro(compile_cu_as_cpp) - foreach(s ${ARGN}) - set_source_files_properties(${s} PROPERTIES LANGUAGE CXX) - set_source_files_properties(${s} PROPERTIES COMPILE_FLAGS "-x c++") - endforeach() -endmacro() - -# link_paddle_exe -# add paddle library for a paddle executable, such as trainer, pserver. -# -# It will handle WITH_PYTHON etc. 
-function(link_paddle_exe TARGET_NAME) - if(WITH_RDMA) - generate_rdma_links() - endif() - - if(MOBILE_INFERENCE) - target_circle_link_libraries(${TARGET_NAME} - ARCHIVE_START - paddle_gserver - paddle_function - ARCHIVE_END - paddle_math - paddle_utils - paddle_parameter - paddle_proto - paddle_cuda - ${EXTERNAL_LIBS} - ${CMAKE_THREAD_LIBS_INIT} - ${CMAKE_DL_LIBS} - ${RDMA_LD_FLAGS} - ${RDMA_LIBS}) - else() - target_circle_link_libraries(${TARGET_NAME} - ARCHIVE_START - paddle_gserver - paddle_function - ARCHIVE_END - paddle_pserver - paddle_trainer_lib - paddle_network - paddle_math - paddle_utils - paddle_parameter - paddle_proto - paddle_cuda - paddle_optimizer - ${EXTERNAL_LIBS} - ${CMAKE_THREAD_LIBS_INIT} - ${CMAKE_DL_LIBS} - ${RDMA_LD_FLAGS} - ${RDMA_LIBS}) - endif() - - if(ANDROID) - target_link_libraries(${TARGET_NAME} log) - endif(ANDROID) - - if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB) - target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") - endif() - - add_dependencies(${TARGET_NAME} ${external_project_dependencies}) -endfunction() - -# link_paddle_test -# Link a paddle unittest for target -# TARGET_NAME: the unittest target name -# Rest Arguemnts: not used. -function(link_paddle_test TARGET_NAME) - link_paddle_exe(${TARGET_NAME}) - target_link_libraries(${TARGET_NAME} - paddle_test_main - paddle_test_util - ${GTEST_LIBRARIES}) -endfunction() - -# add_unittest_without_exec -# -# create a paddle unittest. not specifically define how to run this unittest. -# TARGET_NAME: the unittest target name, same as executable file name -# Rest Arguments: the source files to compile this unittest. -macro(add_unittest_without_exec TARGET_NAME) - add_executable(${TARGET_NAME} ${ARGN}) - link_paddle_test(${TARGET_NAME}) -endmacro() - -# add_unittest -# create a paddle unittest and just to execute this binary to make unittest. -# -# TARGET_NAME: the unittest target name, same as executable file name -# Rest Arguments: the source files to compile this unittest. -macro(add_unittest TARGET_NAME) - add_unittest_without_exec(${TARGET_NAME} ${ARGN}) - add_test(${TARGET_NAME} ${TARGET_NAME}) -endmacro() - -# add_simple_unittest -# create a paddle unittest with file name. It just compile ${TARGET_NAME}.cpp to -# ${TARGET_NAME} and then execute it. 
-macro(add_simple_unittest TARGET_NAME) - add_unittest(${TARGET_NAME} ${TARGET_NAME}.cpp) -endmacro() - -# Creates C resources file from files in given resource file -function(create_resources res_file output_file) - add_custom_command( - OUTPUT ${output_file} - COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file} - DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py) -endfunction() diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 0a4edea2c3..afd3342768 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -45,6 +45,7 @@ paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], vararg paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None) paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None @@ -66,6 +67,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)) paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)) paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')) paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)) @@ -120,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, 
None, None, None, 'uniform', None, 0, False)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) -paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) +paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) @@ -196,7 +198,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -211,6 +213,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) +paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, 
keywords=None, defaults=(15.0, -15.0)) @@ -317,6 +320,7 @@ paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'asp paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)) paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) +paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -356,6 +360,7 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)) paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index d88d9e783e..018ccd4047 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,4 +1,3 @@ - #windows treat symbolic file as a real file, which is different with unix #We create a hidden file and compile it instead of origin source file. 
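For reference, the ArgSpec changes above map directly onto Python call sites. A minimal sketch of the new `normalize` flag on `sigmoid_cross_entropy_with_logits`, with the argument names and defaults taken from the updated ArgSpec entry above; the surrounding program is illustrative only and not part of this patch:

    import paddle.fluid as fluid

    # Shapes here are illustrative; the ArgSpec fixes only the argument list
    # (args=['x', 'label', 'ignore_index', 'name', 'normalize'],
    #  defaults=(-100, None, False)).
    x = fluid.layers.data(name='x', shape=[10], dtype='float32')
    label = fluid.layers.data(name='label', shape=[10], dtype='float32')
    loss = fluid.layers.sigmoid_cross_entropy_with_logits(
        x, label, ignore_index=-100, normalize=True)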
function(windows_symbolic TARGET)
@@ -129,12 +128,6 @@ cc_test(version_test SRCS version_test.cc DEPS version)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
-if(WITH_NGRAPH)
- cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
- cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
- shape_inference data_transform lod_tensor profiler)
-endif(WITH_NGRAPH)
-
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
@@ -171,13 +164,12 @@ if(WITH_DISTRIBUTE)
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
- else()
- if(WITH_NGRAPH)
- cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
- else(WITH_NGRAPH)
+ if (WITH_NGRAPH)
+ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine)
+ else ()
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
- endif(WITH_NGRAPH)
+ endif()
 cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
@@ -215,3 +207,24 @@ endif (NOT WIN32)
 cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
 cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
+
+# Get the current working branch
+execute_process(
+ COMMAND git rev-parse --abbrev-ref HEAD
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ OUTPUT_VARIABLE PADDLE_BRANCH
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+# Get the latest abbreviated commit hash of the working branch
+execute_process(
+ COMMAND git log -1 --format=%h
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ OUTPUT_VARIABLE PADDLE_COMMIT
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+message(STATUS "commit: ${PADDLE_COMMIT}")
+message(STATUS "branch: ${PADDLE_BRANCH}")
+
+configure_file(commit.h.in commit.h)
diff --git a/paddle/fluid/framework/commit.h.in b/paddle/fluid/framework/commit.h.in
new file mode 100644
index 0000000000..3a33ece624
--- /dev/null
+++ b/paddle/fluid/framework/commit.h.in
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <string>
+
+namespace paddle {
+namespace framework {
+
+static std::string paddle_commit() {
+ return "@PADDLE_COMMIT@";
+}
+
+static std::string paddle_compile_branch() {
+ return "@PADDLE_BRANCH@";
+}
+
+static std::string paddle_version() {
+ return "@PADDLE_VERSION@";
+}
+
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 38c03a2604..dae5194744 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -25,6 +25,7 @@ limitations under the License.
*/ #include "paddle/fluid/framework/details/sequential_execution_pass.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" namespace paddle { @@ -282,3 +283,4 @@ USE_PASS(modify_op_lock_and_record_event_pass); USE_PASS(inplace_pass); USE_PASS(lock_free_optimize_pass); USE_PASS(graph_print_pass); +USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 11a80d5f91..649b129161 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -93,7 +93,7 @@ struct BuildStrategy { int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; - bool remove_unnecessary_lock_{false}; + bool remove_unnecessary_lock_{true}; // NOTE: // Before you add new options, think if it's a general strategy that works diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 37b07e5736..318694a1d4 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,6 +25,9 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; + // If we set this to 1, we will delete all variables when finish a batch. and + // this will loss 15%+ performance. + // Please be aware about this parameters. size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index c93bbe7cee..4323883fa5 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -27,7 +27,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_NGRAPH -#include "paddle/fluid/framework/ngraph_operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" #endif DECLARE_bool(benchmark); @@ -133,24 +133,6 @@ static void DeleteUnusedTensors( } } -static void EnableFusedOp(ExecutorPrepareContext* ctx) { -#ifdef PADDLE_WITH_NGRAPH - VLOG(3) << "use_ngraph=True"; - auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_); - for (auto& interval : intervals) { - auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0), - interval.at(1)); - *interval[0] = std::unique_ptr(ng_op); - } - for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { - ctx->ops_.erase(it->at(0) + 1, it->at(1)); - } -#else - LOG(WARNING) - << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; -#endif -} - Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -204,6 +186,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); +#ifdef PADDLE_WITH_NGRAPH + if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc); +#endif auto ctx = Prepare(pdesc, block_id); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -379,7 +364,6 @@ std::unique_ptr Executor::Prepare( for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } - if (FLAGS_use_ngraph) EnableFusedOp(ctx.get()); return ctx; } diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index 2ee12cc410..929d9edc34 100644 --- a/paddle/fluid/framework/ir/graph_traits.cc +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" +#include #include namespace paddle { @@ -79,7 +80,7 @@ NodesTSIterator::NodesTSIterator(const std::vector &source) { } std::unordered_set visited; - std::unordered_set to_visit{source.begin(), source.end()}; + std::set to_visit{source.begin(), source.end()}; std::vector inlink_visited; while (!to_visit.empty()) { diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 6cf405efe6..33ccee6aa0 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -28,10 +28,14 @@ std::unique_ptr Pass::Apply(std::unique_ptr graph) const { PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.", attr); } + auto* native_graph = graph.get(); auto applied_graph = ApplyImpl(std::move(graph)); // TODO(panyx0718): Add more verifications. PADDLE_ENFORCE(!HasCircle(*applied_graph), "Illegal Pass. 
Generated graph shouldn't have cycles.");
+ PADDLE_ENFORCE(applied_graph.get() == native_graph,
+ "Pass::Apply() cannot delete the passed graph and shouldn't "
+ "return a new graph. (Required by pybind11.)");
 applied_ = true;
 return applied_graph;
 }
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 8fbbc6584e..f46bdf96ba 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -54,13 +54,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
 std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
 if (!platform::is_cpu_place(t.place())) {
- LoDTensor tt;
- framework::TensorCopy(t, platform::CPUPlace(), &tt);
+ LoDTensor cpu_tensor;
+ cpu_tensor.set_lod(t.lod());
+ framework::TensorCopy(t, platform::CPUPlace(), &cpu_tensor);
 platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
 auto &dev_ctx = *pool.Get(t.place());
 dev_ctx.Wait();
- os << tt;
+ os << cpu_tensor;
 return os;
 }
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index c3a044d22c..5d854cb8d7 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
+ http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #pragma once
diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
deleted file mode 100644
index 7e174c7def..0000000000
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ /dev/null
@@ -1,545 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include - -#include -#include - -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/ngraph_bridge.h" -#include "paddle/fluid/framework/ngraph_operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_desc.h" -#include "paddle/fluid/framework/var_type.h" - -#include "ngraph/ngraph.hpp" - -namespace paddle { -namespace framework { - -static ngraph::Shape Ddim2Shape(const DDim& dims) { - ngraph::Shape sp; - for (int i = 0; i < dims.size(); ++i) { - int k = dims[i]; - k = k == 0 ? 1 : k; - sp.push_back(k); - } - return sp; -} - -static std::map pd2ng_type_map = { - {proto::VarType::FP32, ngraph::element::f32}, - {proto::VarType::FP64, ngraph::element::f64}, - {proto::VarType::INT32, ngraph::element::i32}, - {proto::VarType::INT64, ngraph::element::i64}, - {proto::VarType::BOOL, ngraph::element::boolean}, -}; - -typedef enum { /* nGraph support state on ops */ - FULL_TRAIN, /* Support full ops for train */ - PARTIAL_TRAIN, /* Support partial ops for train */ - FULL_TEST, /* Support full list of ops for test */ - PARTIAL_TEST /* Support partial list of ops for test */ -} op_state; - -// perform graph build through bridge and execute computation -class NgraphEngine { - public: - explicit NgraphEngine(const Scope& scope, const platform::Place& place, - const std::vector>& ops, - const std::unordered_map< - std::string, ngraph::element::Type>& var_type_map, - const std::unordered_set& persist, - const std::unordered_set& fetches, - const std::unordered_set& post_op_inputs, - op_state ng_op_state) - : scope_(scope), - place_(place), - fused_ops_(ops), - var_type_map_(var_type_map), - persistables_(persist), - fetches_(fetches), - post_op_inputs_(post_op_inputs), - ng_op_state_(ng_op_state) { - var_in_node_map_ = std::make_shared< - std::unordered_map>>(); - - var_node_map_ = std::make_shared< - std::unordered_map>>(); - - BuildNgIO(); - - GetNgFunction(); - } - - void Run(const Scope& scope, const platform::Place& place) const; - - private: - static std::unordered_map> - func_cache_; - const Scope& scope_; - const platform::Place& place_; - std::vector> fused_ops_; - std::unordered_map var_type_map_; - std::unordered_set persistables_; - std::unordered_set fetches_; - std::unordered_set post_op_inputs_; - op_state ng_op_state_; - - // ngraph backend eg. 
CPU - static std::shared_ptr backend_; - // ngraph function to call and execute - std::shared_ptr ngraph_function_; - // var_name of inputs - std::vector var_in_; - // var_name of outputs from fetch in order - std::vector var_out_; - // map input vars to nodes - std::shared_ptr< - std::unordered_map>> - var_in_node_map_; - // map each var name with a ngraph node - std::shared_ptr< - std::unordered_map>> - var_node_map_; - // cache key to check if function is cached - std::shared_ptr GetCacheKey(); - // get ngraph input and define ngraph input parameters - void GetNgInputShape(std::shared_ptr op); - // Call ngraph bridge to map ops - void BuildNgNodes(); - // get the ngraph input and output var list - void BuildNgIO(); - // build ngraph function call - void BuildNgFunction(); - // Check cache for ngraph function or otherwise build the function - void GetNgFunction(); -}; - -std::vector>::iterator>> -NgraphOperator::NgraphOpIntervals( - std::vector>* ops) { - std::vector>::iterator>> - intervals; - if (ops->empty()) { - return intervals; - } - size_t size = ops->size(); - size_t left = 0; - while (left < size && ops->at(left)->Type() != kFeedOpType) { - ++left; - } - if (left == size) { - return intervals; - } - while (left < size && ops->at(left)->Type() == kFeedOpType) { - ++left; - } - - size_t right = left; - while (right < size && ops->at(right)->Type() != kFetchOpType) { - ++right; - } - if (right == size) { - return intervals; - } - if (left >= right) return intervals; - - // (left, right - 1) represents indices between feed and fetch - size_t pivot = left; - while (pivot < right) { - auto op_type = ops->at(pivot)->Type(); - if (paddle::framework::NgraphBridge::NG_NODE_MAP.find(op_type) == - paddle::framework::NgraphBridge::NG_NODE_MAP.end()) { - ++pivot; - } else { - size_t start = pivot, end = start; - while (pivot < right && - (paddle::framework::NgraphBridge::NG_NODE_MAP.find( - ops->at(pivot)->Type()) != - paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { - ++pivot; - ++end; - } - std::vector>::iterator> - interval = {ops->begin() + start, ops->begin() + end}; - intervals.push_back(interval); - } - } // end while - - return intervals; -} - -NgraphOperator::NgraphOperator( - const ProgramDesc& prog, size_t block_id, - std::vector>::iterator start, - std::vector>::iterator end, - const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs), - pdesc_(prog), - block_(block_id) { - for (std::vector>::iterator it = start; - it != end; ++it) { - fused_ops_.push_back(std::move(*it)); - } - - for (std::vector>::iterator it = end; - (*it)->Type() != kFetchOpType; ++it) { - for (auto& var_name_item : (*it)->Inputs()) { - for (auto& var_name : var_name_item.second) { - post_op_inputs_.insert(var_name); - } - } - } - - if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { - is_full_ = true; - } - - Process(); -} - -void NgraphOperator::Process() { - auto& bdesc = pdesc_.Block(block_); - for (auto& var : bdesc.AllVars()) { - if (!(var->GetType() == proto::VarType::SELECTED_ROWS || - var->GetType() == proto::VarType::LOD_TENSOR || - var->GetType() == proto::VarType::LOD_TENSOR_ARRAY)) { - continue; - } - - auto var_name = var->Name(); - if (var->Name() == framework::kEmptyVarName) { - continue; - } - - if (var_name != "fetch" && var_name != "feed") { - auto pd_type = var->GetDataType(); - if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { - 
PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", - var_name); - } - var_type_map_[var_name] = pd2ng_type_map[pd_type]; - } - - if (var->Persistable()) { - persistables_.insert(var->Name()); - } - } - - for (auto* op : bdesc.AllOps()) { - if (op->Type() == kFetchOpType) { - std::string fetch_target_name = op->Input("X")[0]; - fetches_.insert(fetch_target_name); - } - } -} - -void NgraphOperator::RunImpl(const Scope& scope, - const platform::Place& place) const { - op_state ng_op_state = PARTIAL_TEST; - auto& bdesc = pdesc_.Block(block_); - for (auto* op : bdesc.AllOps()) { - if (op->Type().find("_grad") != std::string::npos) { - ng_op_state = PARTIAL_TRAIN; - break; - } - } - - if (is_full_) { - ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; - } - - NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_, - persistables_, fetches_, post_op_inputs_, - ng_op_state); - ngraph_engine.Run(scope, place); -} - -std::unordered_map> - NgraphEngine::func_cache_ = {}; - -std::shared_ptr NgraphEngine::backend_ = - ngraph::runtime::Backend::create("CPU"); - -void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); - op->RuntimeInferShape(scope_, place_, ctx); - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto sp = Ddim2Shape(tensor_pd->dims()); - if (std::find(var_in_.begin(), var_in_.end(), var_name) != - var_in_.end()) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, sp, true); - (*var_node_map_)[var_name] = prm; - (*var_in_node_map_)[var_name] = prm; - } - } - } - } - } -} - -void NgraphEngine::BuildNgNodes() { - for (auto& var_name : var_out_) { - if (var_node_map_->find(var_name) == var_node_map_->end()) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - auto ng_shape = Ddim2Shape(ddim); - auto ng_type = var_type_map_.at(var_name); - auto prm = - std::make_shared(ng_type, ng_shape, true); - (*var_node_map_)[var_name] = prm; - } - } - } - - paddle::framework::NgraphBridge ngb(var_node_map_); - for (auto& op : fused_ops_) { - ngb.BuildNgNode(op); - } -} - -void NgraphEngine::BuildNgIO() { - std::unordered_set inputs; - std::unordered_set outputs; - - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Inputs()) { - for (auto& var_name : var_name_item.second) { - inputs.insert(var_name); - const bool is_output = outputs.find(var_name) != outputs.end(); - if (!is_output && - std::find(var_in_.begin(), var_in_.end(), var_name) == - var_in_.end()) { - // fill var_in here to keep lhs and rhs order - var_in_.push_back(var_name); - } - } - } - - if (op->Type() != "fill_constant") { - GetNgInputShape(op); - } - - for (auto& var_name_item : op->Outputs()) { - PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, - "op %s has more than 1 output - Not handling yet", - op->Type()); - for (auto& var_name : var_name_item.second) { - outputs.insert(var_name); - } - } - } - - // var_out.clear(); - for (auto& op : fused_ops_) { - for (auto& var_name_item : op->Outputs()) { - PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, - "op %s has more than 1 output - Not handling yet", - op->Type()); - for (auto& 
var_name : var_name_item.second) { - switch (ng_op_state_) { - case PARTIAL_TEST: - if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || - fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); - } - break; - case FULL_TEST: - if (fetches_.find(var_name) != fetches_.end()) { - var_out_.push_back(var_name); - } - break; - case PARTIAL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || - post_op_inputs_.find(var_name) != post_op_inputs_.end() || - persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); - } - break; - case FULL_TRAIN: - if (fetches_.find(var_name) != fetches_.end() || - persistables_.find(var_name) != persistables_.end()) { - var_out_.push_back(var_name); - } - break; - default: - var_out_.push_back(var_name); - } - } - } - } -} - -void NgraphEngine::BuildNgFunction() { - BuildNgNodes(); - ngraph_function_ = nullptr; - ngraph::NodeVector func_outputs; - ngraph::ParameterVector func_inputs; - - for (auto& vo : var_out_) { - func_outputs.push_back(var_node_map_->at(vo)); - } - - for (auto& vi : var_in_) { - std::shared_ptr prm = - std::dynamic_pointer_cast( - var_in_node_map_->at(vi)); - func_inputs.push_back(prm); - } - - ngraph_function_ = - std::make_shared(func_outputs, func_inputs); -} - -std::shared_ptr NgraphEngine::GetCacheKey() { - auto cache_key = std::make_shared(""); - *cache_key += std::to_string(fused_ops_.size()); - for (auto& op : fused_ops_) { - *cache_key += op->Type(); - } - for (auto& var_name : var_in_) { - auto shape = var_node_map_->at(var_name)->get_shape(); - *cache_key += var_name; - *cache_key += var_type_map_.at(var_name).c_type_string(); - for (size_t i = 0; i < shape.size(); ++i) { - *cache_key += std::to_string(shape.at(i)); - } - } - - for (auto& var_name : var_out_) { - auto* var = scope_.FindVar(var_name); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto& ddim = tensor_pd->dims(); - for (int i = 0; i < ddim.size(); ++i) { - *cache_key += std::to_string(ddim[i]); - } - } - } - return cache_key; -} - -void NgraphEngine::GetNgFunction() { - bool cache_on = true; - if (cache_on) { - std::string cache_key_val = *GetCacheKey(); - if (func_cache_.find(cache_key_val) != func_cache_.end()) { - ngraph_function_ = func_cache_.at(cache_key_val); - } else { - BuildNgFunction(); - func_cache_[cache_key_val] = ngraph_function_; - } - } else { - BuildNgFunction(); - } -} - -void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { - std::vector> t_in; - std::vector> t_out; - - for (size_t i = 0; i < var_in_.size(); ++i) { - auto vi = var_in_.at(i); - auto sp = var_node_map_->at(vi)->get_shape(); - std::shared_ptr ti; - auto* var = scope.FindVar(vi); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), - "Ensure ngraph tensor layout align with paddle tensor"); - if (tensor_pd->type() == proto::VarType::FP32) { - const float* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::f32, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::INT32) { - const int* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i32, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::INT64) { - const int64_t* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::i64, sp, - const_cast(arr)); - } else if (tensor_pd->type() == 
proto::VarType::FP64) { - const double* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::f64, sp, - const_cast(arr)); - } else if (tensor_pd->type() == proto::VarType::BOOL) { - const bool* arr = tensor_pd->data(); - ti = backend_->create_tensor(ngraph::element::boolean, sp, - const_cast(arr)); - } else { - PADDLE_THROW("Data type not handling for var %s", vi); - } - } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", vi); - } - bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST) - ? true - : false; - bool is_persistable = - (persistables_.find(vi) != persistables_.end()) ? true : false; - if (is_test && is_persistable) { - ti->set_stale(false); - } - t_in.push_back(ti); - } - - for (size_t i = 0; i < var_out_.size(); ++i) { - auto var_name = var_out_[i]; - auto* var = scope.FindVar(var_name); - std::shared_ptr to; - if (var && var->IsType()) { - auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); - auto dd = tensor_pd->dims(); - ngraph::Shape sp = Ddim2Shape(dd); - auto ng_type = var_type_map_.at(var_name); - if (ng_type == ngraph::element::f32) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); - } else if (ng_type == ngraph::element::i64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); - } else if (ng_type == ngraph::element::f64) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); - } else if (ng_type == ngraph::element::boolean) { - auto pd_arr = tensor_pd->mutable_data(place); - to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); - } else { - PADDLE_THROW("Data type not handled in for var %s", var_name); - } - t_out.push_back(to); - } else { - PADDLE_THROW("Cannot find var or tensor with var name %s", var_name); - } - } - - backend_->call(backend_->compile(ngraph_function_), t_out, t_in); -} // NgraphEngine::RunImpl -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h deleted file mode 100644 index ede80f44be..0000000000 --- a/paddle/fluid/framework/ngraph_operator.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/op_kernel_type.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/variant.h"
-
-#include "ngraph/type/element_type.hpp"
-
-namespace paddle {
-namespace framework {
-
-class NgraphOperator : public OperatorBase {
- public:
- static std::vector<
- std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
- NgraphOpIntervals(
- std::vector<std::unique_ptr<OperatorBase>>* ops);
-
- explicit NgraphOperator(
- const ProgramDesc& prog, size_t block_id,
- std::vector<std::unique_ptr<OperatorBase>>::iterator start,
- std::vector<std::unique_ptr<OperatorBase>>::iterator end,
- const std::string& type = "fused_op", const VariableNameMap& inputs = {},
- const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
-
- void RunImpl(const Scope& scope, const platform::Place& place) const final;
-
- private:
- const ProgramDesc pdesc_;
- size_t block_;
- std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
- std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
- std::unordered_set<std::string> persistables_;
- std::unordered_set<std::string> fetches_;
- std::unordered_set<std::string> post_op_inputs_;
- bool is_full_ = false;
-
- void Process();
-};
-} // namespace framework
-} // namespace paddle
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 38f811c0e9..ab3cf308fc 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -19,8 +19,6 @@ limitations under the License. */
 #include
 #include
 #include
-#include "gflags/gflags.h"
-#include "glog/logging.h"
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -1075,7 +1073,9 @@ Scope* OperatorWithKernel::PrepareData(
 proto::VarType::Type OperatorWithKernel::IndicateDataType(
 const ExecutionContext& ctx) const {
- int data_type = -1;
+ proto::VarType::Type default_data_type =
+ static_cast<proto::VarType::Type>(-1);
+ proto::VarType::Type data_type = default_data_type;
 for (auto& input : this->inputs_) {
 const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
 for (size_t i = 0; i < vars.size(); ++i) {
@@ -1092,18 +1092,19 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
 if (t != nullptr) {
 PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu) is not initialized",
 input.first, i);
- int tmp = static_cast<int>(t->type());
+ proto::VarType::Type tmp = t->type();
 PADDLE_ENFORCE(
- tmp == data_type || data_type == -1,
- "DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
- Type(), data_type, tmp);
Get (%d) != (%d)", - Type(), data_type, tmp); + Type(), DataTypeToString(data_type), DataTypeToString(tmp)); data_type = tmp; } } } } - PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input"); - return static_cast(data_type); + PADDLE_ENFORCE(data_type != dafault_data_type, + "DataType should be indicated by input"); + return data_type; } OpKernelType OperatorWithKernel::GetExpectedKernelType( diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index ce3ad18b1f..ef5404e475 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -25,7 +25,8 @@ inline const T* Tensor::data() const { check_memory_size(); bool valid = std::is_same::value || type_ == DataTypeTrait::DataType; - PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_); + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", + DataTypeToString(type_)); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index a730b84a91..5db4221199 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator) -cc_library(tracer SRCS tracer.cc DEPS proto_desc) +cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas) +cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context) cc_library(engine SRCS engine.cc) endif() diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index b7df4b8886..83fc6ee2e2 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/imperative/layer.h" + #include #include #include @@ -22,6 +23,9 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -34,22 +38,66 @@ std::map py_funcs_; using framework::Variable; -void AddTo(Variable* src, Variable* dst) { - framework::LoDTensor* dst_tensor = dst->GetMutable(); - framework::LoDTensor* src_tensor = src->GetMutable(); +namespace detail { + +template +class TensorAddToFunctor : public boost::static_visitor<> { + public: + TensorAddToFunctor(int64_t numel, const T* x, T* y) + : numel_(numel), x_(x), y_(y) {} + + void operator()(const platform::CPUPlace& place) { + platform::CPUDeviceContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } + +#ifdef PADDLE_WITH_CUDA + void operator()(const platform::CUDAPlace& place) { + platform::CUDADeviceContext* ctx = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } +#else + void operator()(const platform::CUDAPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } +#endif + + // there is NO blas in CUDAPinnedPlace + void operator()(const platform::CUDAPinnedPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } + + private: + int64_t numel_; + const T* x_; + T* y_; +}; + +} // namespace detail + +void AddTo(Variable* src, Variable* dst, platform::Place place) { + framework::Tensor* dst_tensor = dst->GetMutable(); + framework::Tensor* src_tensor = src->GetMutable(); + // FIXME(minqiyang): loss_grad op will pass a zero grad of label // ugly fix for it if (src_tensor->numel() == 0) { return; } + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "dst_numel %lld vs. 
src_numel %lld", dst_tensor->numel(), src_tensor->numel()); - float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); - const float* src_data = src_tensor->data(); - for (int64_t i = 0; i < src_tensor->numel(); ++i) { - dst_data[i] += src_data[i]; - } + + detail::TensorAddToFunctor func( + src_tensor->numel(), src_tensor->data(), + dst_tensor->mutable_data(place)); + boost::apply_visitor(func, place); } class Autograd { @@ -120,66 +168,104 @@ class Autograd { } }; +std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, + const bool blocking) const { + PADDLE_ENFORCE(var_->IsInitialized(), + "Variable must be initialized when getting numpy tensor"); + + std::unique_ptr new_var(new VarBase()); + framework::LoDTensor* tensor = + new_var->var_->GetMutable(); + tensor->Resize(var_->Get().dims()); + tensor->set_lod(var_->Get().lod()); + + if (blocking) { + platform::DeviceContext* dev_ctx = + platform::DeviceContextPool::Instance().Get(dst_place); + + framework::TensorCopySync(var_->Get(), dst_place, + tensor); + + dev_ctx->Wait(); + } else { + framework::TensorCopy(var_->Get(), dst_place, tensor); + } + + if (platform::is_gpu_place(dst_place)) { + VLOG(3) << "copy tensor " << var_desc_->Name() << " from gpu"; + } + + return new_var; +} + framework::LoDTensor& VarBase::GradValue() { VLOG(3) << "get var grad " << var_desc_->Name(); return *(grads_->var_->GetMutable()); } std::map> OpBase::ApplyGrad() { - if (!grad_op_desc_ && backward_id_ <= 0) { + if (grad_op_descs_.empty() && backward_id_ <= 0) { LOG(WARNING) << "op with no grad: " << op_desc_->Type(); return {}; } - std::map> grad_outputs; + std::vector grad_outputs; if (backward_id_ > 0) { VLOG(3) << "py_layer_grad"; - grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad( - backward_id_, - grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]); + grad_outputs.resize(1); + grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = + PyLayer::ApplyGrad( + backward_id_, + grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]); } else { - VLOG(3) << "op grad " << grad_op_desc_->Type(); - for (auto it : grad_output_vars_) { - auto& outputs = grad_outputs[it.first]; - for (size_t i = 0; i < it.second.size(); ++i) { - // Allocate a new variable - Variable* tmp_var = new framework::Variable(); - tmp_var->GetMutable(); - outputs.push_back(tmp_var); + grad_outputs.resize(grad_op_descs_.size()); + for (size_t k = 0; k < grad_op_descs_.size(); ++k) { + framework::OpDesc* grad_op_desc = grad_op_descs_[k]; + VLOG(3) << "op grad " << grad_op_desc->Type(); + for (auto it : grad_output_vars_[k]) { + auto& outputs = grad_outputs[k][it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + // Allocate a new variable + Variable* tmp_var = new framework::Variable(); + tmp_var->GetMutable(); + outputs.push_back(tmp_var); + } } - } - framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]); - // No need to do compile time infer shape here. - // grad_op_desc_->InferShape(*block_); - grad_op_desc_->InferVarType(block_); + // No need to do compile time infer shape here. 
+ // grad_op_desc_->InferShape(*block_);
+ grad_op_desc->InferVarType(block_);
- std::unique_ptr<framework::OperatorBase> opbase =
- framework::OpRegistry::CreateOp(*grad_op_desc_);
- framework::OperatorWithKernel* op_kernel =
- dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
- PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
+ std::unique_ptr<framework::OperatorBase> opbase =
+ framework::OpRegistry::CreateOp(*grad_op_desc);
+ framework::OperatorWithKernel* op_kernel =
+ dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
+ PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
- framework::Scope scope;
- platform::CPUPlace place;
- PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place);
- p.op.RuntimeInferShape(scope, place, ctx);
- p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+ framework::Scope scope;
+ PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
+ p.op.RuntimeInferShape(scope, place_, ctx);
+ p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
+ }
 }
- for (auto it : grad_output_vars_) {
- auto& outputs = grad_outputs[it.first];
- auto& origin_outputs = it.second;
- PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
-
- for (size_t i = 0; i < outputs.size(); ++i) {
- framework::Variable* grad = outputs[i];
- framework::Variable* orig_grad = origin_outputs[i];
- AddTo(grad, orig_grad);
- delete grad;
+ for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
+ for (auto it : grad_output_vars_[k]) {
+ auto& outputs = grad_outputs[k][it.first];
+ auto& origin_outputs = it.second;
+ PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
+
+ for (size_t i = 0; i < outputs.size(); ++i) {
+ framework::Variable* grad = outputs[i];
+ framework::Variable* orig_grad = origin_outputs[i];
+ AddTo(grad, orig_grad, place_);
+ delete grad;
+ }
 }
 }
+
 return input_vars_;
 }
@@ -188,8 +274,10 @@ void VarBase::RunBackward() {
 VLOG(3) << "start backward";
 auto grads_t = grads_->var_->GetMutable<framework::LoDTensor>();
- float* data = grads_t->mutable_data<float>(platform::CPUPlace());
- std::fill(data, data + grads_t->numel(), 1.0);
+ operators::math::set_constant(
+ *(platform::DeviceContextPool::Instance().Get(
+ var_->GetMutable<framework::LoDTensor>()->place())),
+ grads_t, 1.0);
 PADDLE_ENFORCE(
 grads_ ==
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 0b1077c640..dc97433a51 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -21,17 +21,21 @@
 #include // NOLINT
 #include // NOLINT
 #include // NOLINT
+#include // NOLINT
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/imperative/type_defs.h"
 namespace paddle {
 namespace imperative {
+class VarBase;
+
 namespace py = ::pybind11;
 class PreparedOp {
@@ -81,6 +85,8 @@ class PreparedOp {
 return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
 }
+ inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; }
+
 const framework::OperatorBase& op;
 const framework::RuntimeContext& ctx;
 framework::OperatorWithKernel::OpKernelFunc func;
@@ -148,6 +154,9 @@ class VarBase {
 framework::LoDTensor& GradValue();
+ std::unique_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
+ const bool blocking) const;
+
 inline std::string GradName() const {
 PADDLE_ENFORCE(
 var_desc_,
@@ -175,11 +184,13 @@ class OpBase {
 OpBase()
 : op_desc_(nullptr),
 forward_id_(-1),
- grad_op_desc_(nullptr),
- backward_id_(-1) {}
+ backward_id_(-1),
+ place_(platform::CPUPlace()) {}
 virtual ~OpBase() {
- if (grad_op_desc_) delete grad_op_desc_;
+ for (framework::OpDesc* desc : grad_op_descs_) {
+ delete desc;
+ }
 }
 std::map<std::string, std::vector<VarBase*>> ApplyGrad();
@@ -188,18 +199,25 @@
 // For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
 framework::OpDesc* op_desc_;
 int forward_id_;
- // When has backward, one of `grad_op_desc_` or `backward_id_` is set,
+
+ // When there is a backward op, one of `grad_op_descs_` or `backward_id_` is set,
 // not both.
- framework::OpDesc* grad_op_desc_;
+ // Note: each fwd op corresponds to a vector of bwd ops.
+ std::vector<framework::OpDesc*> grad_op_descs_;
 int backward_id_;
+
+ platform::Place place_;
+
 VarBasePtrMap input_vars_;
 VarBasePtrMap output_vars_;
 OpBasePtrMap pre_ops_;
 std::map<std::string, std::vector<int>> pre_ops_out_idx_;
- framework::VariableValueMap grad_input_vars_;
- framework::VariableValueMap grad_output_vars_;
+ // Inputs to a vector of bwd ops.
+ std::vector<framework::VariableValueMap> grad_input_vars_;
+ // Outputs to a vector of bwd ops.
+ std::vector<framework::VariableValueMap> grad_output_vars_;
+
 framework::BlockDesc* block_;
 };
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 843fee41f3..cd62807a55 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -14,33 +14,60 @@
 #include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
 namespace paddle {
 namespace imperative {
 void CreateGradOp(const framework::OpDesc& op_desc,
 const std::unordered_set<std::string>& no_grad_set,
 const std::vector<framework::BlockDesc*>& grad_sub_block,
- framework::OpDesc** grad_op_desc,
+ std::vector<framework::OpDesc*>* grad_op_descs,
 std::unordered_map<std::string, std::string>* grad_to_var) {
- std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+ PADDLE_ENFORCE(grad_op_descs->empty());
+ std::vector<std::unique_ptr<framework::OpDesc>> descs =
 framework::OpInfoMap::Instance()
 .Get(op_desc.Type())
 .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
- PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
- // TODO(panyx0718): Leak?
- *grad_op_desc = grad_op_descs[0].release(); + for (auto& desc : descs) { + grad_op_descs->emplace_back(desc.release()); + } } -void InitVar(framework::Variable* var, framework::Variable* grad_var) { +void InitVar(framework::Variable* var, framework::Variable* grad_var, + platform::DeviceContext* dev_ctx) { + PADDLE_ENFORCE_NOT_NULL(dev_ctx, + "Could not get valid device from forward op"); auto& var_t = var->Get(); - float* data = - grad_var->GetMutable()->mutable_data( - var_t.dims(), platform::CPUPlace()); - std::fill(data, data + var_t.numel(), 0.0); + grad_var->GetMutable()->mutable_data( + var_t.dims(), dev_ctx->GetPlace()); + operators::math::set_constant( + *dev_ctx, grad_var->GetMutable(), 0.0); +} + +platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { + platform::Place result = place; + for (auto it : inputs) { + for (VarBase* var : it.second) { + platform::Place tmp_place = + var->var_->Get().place(); + if (!platform::is_same_place(tmp_place, result)) { + PADDLE_THROW( + "All input variables should be on the same place: %s, but got " + "place %s for input %s instead", + result, tmp_place, it.first); + } + } + } + + return result; } void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, const VarBasePtrMap& outputs, framework::BlockDesc* block, + const platform::Place expected_place, const bool stop_gradient) { std::map vars; @@ -105,51 +132,59 @@ PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); framework::Scope scope; - platform::CPUPlace place; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); - p.op.RuntimeInferShape(scope, place, ctx); - p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); + op->place_ = GetExpectedPlace(expected_place, inputs); + PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); + prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); + prepared_op.func(framework::ExecutionContext( + prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx)); if (!stop_gradient) { - framework::OpDesc* grad_op_desc; - // TODO(panyx): Is this leaked? std::unique_ptr> grad_to_var( new std::unordered_map()); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get()); - op->grad_op_desc_ = grad_op_desc; - - for (auto it : grad_op_desc->Inputs()) { - auto& grad_in_vars = op->grad_input_vars_[it.first]; - for (const std::string& grad_invar : it.second) { - block->FindRecursiveOrCreateVar(grad_invar); - auto var_it = grad_to_var->find(grad_invar); - if (var_it == grad_to_var->end()) { - auto fwd_var_it = vars.find(grad_invar); - PADDLE_ENFORCE(fwd_var_it != vars.end()); - // Forward inputs or outputs.
- grad_in_vars.push_back(fwd_var_it->second->var_); - } else { - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_); + CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get()); + + op->grad_input_vars_.resize(op->grad_op_descs_.size()); + op->grad_output_vars_.resize(op->grad_op_descs_.size()); + for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) { + framework::OpDesc* grad_op_desc = op->grad_op_descs_[i]; + for (auto it : grad_op_desc->Inputs()) { + auto& grad_in_vars = op->grad_input_vars_[i][it.first]; + for (const std::string& grad_invar : it.second) { + block->FindRecursiveOrCreateVar(grad_invar); + auto var_it = grad_to_var->find(grad_invar); + if (var_it == grad_to_var->end()) { + auto fwd_var_it = vars.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != vars.end()); + // Forward inputs or outputs. + grad_in_vars.push_back(fwd_var_it->second->var_); + } else { + VarBase* var = vars[var_it->second]; + if (!var->grads_->var_->IsInitialized()) { + InitVar(var->var_, var->grads_->var_, + prepared_op.GetDeviceContext()); + } + // Douts. + grad_in_vars.push_back(var->grads_->var_); } - // Douts. - grad_in_vars.push_back(var->grads_->var_); } } - } - for (auto it : grad_op_desc->Outputs()) { - auto& grad_out_vars = op->grad_output_vars_[it.first]; - for (const std::string& grad_outvar : it.second) { - block->FindRecursiveOrCreateVar(grad_outvar); - auto var_it = grad_to_var->find(grad_outvar); - PADDLE_ENFORCE(var_it != grad_to_var->end()); - VarBase* var = vars[var_it->second]; - if (!var->grads_->var_->IsInitialized()) { - InitVar(var->var_, var->grads_->var_); + for (auto it : grad_op_desc->Outputs()) { + auto& grad_out_vars = op->grad_output_vars_[i][it.first]; + for (const std::string& grad_outvar : it.second) { + block->FindRecursiveOrCreateVar(grad_outvar); + auto var_it = grad_to_var->find(grad_outvar); + PADDLE_ENFORCE(var_it != grad_to_var->end(), + "Could not find the grad op output var; should this " + "operator %s's stop_gradient be True", + op_desc->Type()); + VarBase* var = vars[var_it->second]; + if (!var->grads_->var_->IsInitialized()) { + InitVar(var->var_, var->grads_->var_, + prepared_op.GetDeviceContext()); + } + grad_out_vars.push_back(var->grads_->var_); } - grad_out_vars.push_back(var->grads_->var_); } } } @@ -178,10 +213,12 @@ std::vector Tracer::PyTrace(OpBase* op, out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient); } if (!stop_gradient) { + op->grad_input_vars_.resize(1); + op->grad_output_vars_.resize(1); auto& grad_input_vars = - op->grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]; + op->grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]; auto& grad_output_vars = - op->grad_output_vars_[framework::GradVarName(PyLayer::kFwdOut)]; + op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)]; for (const VarBase* inp : inputs) { grad_input_vars.push_back(inp->var_); @@ -189,16 +226,23 @@ std::vector Tracer::PyTrace(OpBase* op, for (VarBase* out : outputs) { grad_input_vars.push_back(out->var_); } + + platform::CPUPlace place; for (VarBase* out : outputs) { grad_input_vars.push_back(out->grads_->var_); if (!grad_input_vars.back()->IsInitialized()) { - InitVar(out->var_, grad_input_vars.back()); + // TODO(minqiyang): Add GPU support for PyLayer; only CPU is supported now + InitVar(out->var_, grad_input_vars.back(), + platform::DeviceContextPool::Instance().Get(place)); } } + for (const VarBase* inp : inputs) {
grad_output_vars.push_back(inp->grads_->var_); if (!grad_output_vars.back()->IsInitialized()) { - InitVar(inp->var_, grad_output_vars.back()); + // TODO(minqiyang): Add GPU support for PyLayer; only CPU is supported now + InitVar(inp->var_, grad_output_vars.back(), + platform::DeviceContextPool::Instance().Get(place)); } } } diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index f225d8abe6..6908382155 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace imperative { @@ -34,21 +35,25 @@ void CreateGradOp(const framework::OpDesc& op_desc, void InitVar(framework::Variable* var, framework::Variable* grad_var); +platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs); + class Tracer { public: explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} virtual ~Tracer() {} - void Trace(OpBase* op, - const std::map>& inputs, - const std::map>& outputs, - framework::BlockDesc* block, const bool stop_gradient = false); + void Trace(OpBase* op, const VarBasePtrMap& inputs, + const VarBasePtrMap& outputs, framework::BlockDesc* block, + const platform::Place expected_place, + const bool stop_gradient = false); std::vector PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient = false); private: + platform::Place GetPlace(const VarBasePtrMap& inputs); + framework::BlockDesc* root_block_; }; diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 88ce61f9b9..a2546ead93 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/platform/variant.h" namespace paddle { @@ -130,10 +131,14 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, + contrib::AnalysisConfig::Precision); // Memory optimization related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); - DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool); + DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool); + DECL_ARGUMENT_FIELD(static_memory_optim_force_update, + StaticMemoryOptimForceUpdate, bool); // Indicates which kind of sort algorithm is used for operators; the memory // optimization relies on the sort algorithm.
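Each DECL_ARGUMENT_FIELD line above and below declares one typed field on Argument. A minimal sketch of the pattern it presumably expands to, inferred from call sites elsewhere in this patch such as argument->static_memory_optim() and argument_.SetStaticMemoryOptim(...); the real macro body may differ:

struct ArgumentFieldSketch {
  // Hypothetical expansion of DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool).
  bool static_memory_optim() const { return static_memory_optim_; }
  bool static_memory_optim_valid() const { return static_memory_optim_valid_; }
  void SetStaticMemoryOptim(bool v) {
    static_memory_optim_ = v;
    static_memory_optim_valid_ = true;  // readers can check validity before use
  }

 private:
  bool static_memory_optim_{false};
  bool static_memory_optim_valid_{false};
};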
DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index ca40c01fc5..4f5c50d0d6 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -36,6 +36,14 @@ void SetAttr(framework::proto::OpDesc *op, const std::string &name, attr->set_i(data); } template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const bool &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(data); +} +template <> void SetAttr(framework::proto::OpDesc *op, const std::string &name, const int64_t &data) { auto *attr = op->add_attrs(); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index de04713b53..120f6ef27d 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include #include @@ -29,9 +30,14 @@ limitations under the License. */ #include "paddle/fluid/platform/port.h" #ifdef _WIN32 +#include +#include #define GCC_ATTRIBUTE(attr__) ; +#define MKDIR(path) _mkdir(path) #else +#include #define GCC_ATTRIBUTE(attr__) __attribute__((attr__)); +#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) #endif #define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result) @@ -163,6 +169,54 @@ static bool PathExists(const std::string &path) { return false; } +static std::string GetDirRoot(const std::string &path) { + char sep = '/'; + +#ifdef _WIN32 + sep = '\\'; +#endif + + size_t i = path.rfind(sep, path.length()); + if (i != std::string::npos) { + return (path.substr(0, i)); + } + return path; +} + +static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) { + std::string opt_cache_dir = model_root + "/_opt_cache/"; + if (!PathExists(opt_cache_dir)) { + PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1, + "Can not create optimize cache directory: %s, Make sure you " + "have permission to write", + opt_cache_dir); + } + return opt_cache_dir; +} + +static std::string GetTrtCalibPath(const std::string &model_root, + const std::string &engine_key) { + return model_root + "/trt_calib_" + engine_key; +} + +// If there is no calib table data file in model_opt_cache_dir, return "". 
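The lookup helper that the comment above introduces follows next; before it, a quick sketch of how the three directory helpers defined above compose (the model path and engine key are made up):

#include <iostream>
#include <string>
#include "paddle/fluid/inference/analysis/helper.h"
int main() {
  using namespace paddle::inference::analysis;
  std::string root = GetDirRoot("/models/resnet50/__model__");    // "/models/resnet50"
  std::string cache_dir = GetOrCreateModelOptCacheDir(root);      // creates "<root>/_opt_cache/"
  // The calibration table for a given engine key lives at "<cache_dir>/trt_calib_<key>".
  std::cout << GetTrtCalibPath(cache_dir, "1432804983") << "\n";
  return 0;
}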
+static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir, + const std::string &engine_key, + bool enable_int8) { + std::string trt_calib_table_path = + GetTrtCalibPath(model_opt_cache_dir, engine_key); + if (enable_int8 && FileExists(trt_calib_table_path)) { + VLOG(3) << "Calibration table file: " << trt_calib_table_path + << " is found here"; + std::ifstream infile(trt_calib_table_path, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string calibration_data(buffer.str()); + return calibration_data; + } + return ""; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 4e14642264..99611ce84b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -67,6 +67,20 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", new int(argument->tensorrt_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); + + bool enable_int8 = argument->tensorrt_precision_mode() == + contrib::AnalysisConfig::Precision::kInt8; + + pass->Set("enable_int8", new bool(enable_int8)); + std::string model_opt_cache_dir = + argument->Has("model_dir") + ? argument->model_dir() + : GetDirRoot(argument->model_program_path()); + pass->Set( + "model_opt_cache_dir", + new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); } // graph_ = pass->Apply(std::move(graph_)); @@ -91,11 +105,14 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { } framework::proto::ProgramDesc IRPassManager::AcquireProgram( - std::unique_ptr *graph, const ProgramDesc &program) const { + std::unique_ptr *graph, ProgramDesc *program) const { auto pass = framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); - ProgramDesc desc(program); + // Directly using ProgramDesc desc(argument->main_program()) may cause + // incomplete copies of information.
+ ProgramDesc desc; + desc.CopyFrom(*program->Proto()); pass->SetNotOwned("program", &desc); auto *the_graph = graph->release(); *graph = pass->Apply(std::unique_ptr(the_graph)); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 983a582649..2a595cb36b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -29,6 +29,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/helper.h" namespace paddle { namespace inference { @@ -42,8 +43,8 @@ class IRPassManager final { std::unique_ptr Apply(std::unique_ptr graph); - framework::proto::ProgramDesc AcquireProgram( - std::unique_ptr *graph, const ProgramDesc &program) const; + framework::proto::ProgramDesc AcquireProgram(std::unique_ptr *graph, + ProgramDesc *program) const; framework::ir::Graph &graph() const { return *graph_; } diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 5f25303cc1..69a9caec03 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include @@ -67,12 +68,33 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( return graph; } +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} + void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, Graph *graph) const { auto *op_desc = node->Op(); auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); + framework::ProgramDesc *program_desc = + Get("program"); + // Add new block for TensorRTEngineOP + const framework::BlockDesc &main_block = + program_desc->Block(framework::kRootBlockIndex); + // const framework::BlockDesc& main_block = program_desc->Block(0); + framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); + // A fake block desc. framework::proto::BlockDesc block_proto; framework::BlockDesc block_desc(nullptr, &block_proto); @@ -82,13 +104,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, subgraph.size()); for (auto *node : subgraph) { + auto *new_block_op = new_block->AppendOp(); auto *op = block_desc.AppendOp(); + *new_block_op->Proto() = *node->Op()->Proto(); *op->Proto() = *node->Op()->Proto(); } - // collect inputs - std::unordered_set input_names; - std::unordered_set input_names_with_id; + // Later, input_names_with_id and output_names_with_id will be used to + // generate the engine key. + // So, we use set instead of unordered_set here to ensure that the engine key + // is unique.
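A small self-contained illustration of why the ordered std::set matters for GenerateEngineKey above: iteration order, and hence the concatenated string and its hash, does not depend on insertion order (variable names are made up):

#include <functional>
#include <iostream>
#include <set>
#include <string>
int main() {
  std::set<std::string> a{"conv2d_0.tmp_0", "batch_norm_0.tmp_2"};
  std::set<std::string> b{"batch_norm_0.tmp_2", "conv2d_0.tmp_0"};  // reversed insertion
  std::string ka, kb;
  for (const auto& n : a) ka += n;  // traversal is always alphabetical
  for (const auto& n : b) kb += n;
  std::cout << (std::hash<std::string>()(ka) == std::hash<std::string>()(kb)) << "\n";  // 1
  return 0;
}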
+ std::set input_names; + std::set input_names_with_id; for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); @@ -96,8 +123,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, op_desc->SetInput( "Xs", std::vector(input_names.begin(), input_names.end())); - std::unordered_set output_names; - std::unordered_set output_names_with_id; + std::set output_names; + std::set output_names_with_id; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); @@ -182,7 +209,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // to Tensor. std::vector output_mapping; for (auto name : output_names) { - // LOG(INFO) << name << " " << output_name_map.size(); PADDLE_ENFORCE(output_name_map.count(name) != 0); output_mapping.push_back(output_name_map[name]); } @@ -193,16 +219,29 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, *vars->Add() = *node->Var()->Proto(); } } + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); PADDLE_ENFORCE(!output_mapping.empty()); - // Set attrs + op_desc->SetBlockAttr("sub_block", new_block); SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); + // Set attrs SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); + + auto enable_int8 = Get("enable_int8"); + auto engine_key = + GenerateEngineKey(input_names_with_id, output_names_with_id); + + std::string calibration_data = GetTrtCalibTableData( + Get("model_opt_cache_dir"), engine_key, enable_int8); + SetAttr(op_desc->Proto(), "calibration_data", calibration_data); + + SetAttr(op_desc->Proto(), "enable_int8", enable_int8); + SetAttr(op_desc->Proto(), "engine_key", engine_key); } std::vector ExtractParameters( diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index 691c336ebe..9d74dc6c21 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass) +cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor) cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc index f1da37af3c..6b3d80fcef 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -31,7 +31,11 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) { } std::unique_ptr graph(argument->main_graph_ptr()); - framework::ProgramDesc desc(argument->main_program()); + + // Direct using 
ProgramDesc desc(argument->main_program()) may cause + // incomplete copies of information. + framework::ProgramDesc desc; + desc.CopyFrom(*argument->main_program().Proto()); pass->SetNotOwned("program", &desc); auto thegraph = pass->Apply(std::move(graph)); thegraph.release(); // the argument still own the graph. diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 57683c0b72..3d1be9196f 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -444,6 +444,26 @@ std::vector>> DeseralizeBatchVarShapes( return batch_shapes; } +// Replace the -1 in shape to a real number to fake the shape. +std::vector>> FakeBatchVarShapes( + const framework::ProgramDesc& program) { + std::vector>> res; + res.emplace_back(); + auto& record = res.front(); + const int fake_batch_size = 3; + for (auto* var : program.Block(0).AllVars()) { + if (var->GetType() == + framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { + auto shape = var->GetShape(); + for (auto& v : shape) { + if (v < 0) v = fake_batch_size; + } + record[var->Name()].assign(shape.begin(), shape.end()); + } + } + return res; +} + // Calculate the average dim of each tensor from the batch shape cache. std::unordered_map GetBatchAverageSize( const std::vector>>& batches) { @@ -478,6 +498,7 @@ std::vector> AnalysisBatchShapesByBatchSize( std::unordered_map var_batchsize_hashes; for (auto& batch : batches) { for (auto& ele : batch) { + PADDLE_ENFORCE(!ele.second.empty()); int batch_size = ele.second.front(); // TODO(Superjomn) might consume large memory here, use combine hash. var_batchsize_hashes[ele.first] << batch_size; @@ -538,9 +559,21 @@ std::vector> AnalysisBatchShapesBySimilarSize( std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; } +std::pair GetRange( + const std::unordered_map& ave_size) { + auto res = std::make_pair(std::numeric_limits::max(), + std::numeric_limits::min()); + for (auto& item : ave_size) { + res.first = std::min(item.second, res.first); + res.second = std::max(item.second, res.second); + } + return res; +} + void MemoryOptimizePass::RunImpl(Argument* argument) { // When force update, should not optimize memory. - if (!argument->enable_memory_optim() || argument->memory_optim_force_update()) + if (!argument->enable_memory_optim() || + argument->static_memory_optim_force_update()) return; graph_ = argument->main_graph_ptr(); @@ -549,21 +582,38 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { argument->model_program_path_valid() ? argument->model_program_path() : ""); VLOG(3) << "Load memory cache from " << path; - if (inference::IsFileExists(path)) { - VLOG(4) << "Performing memory optimize"; - auto batches = DeseralizeBatchVarShapes(path); - auto var_batch_ave_size = GetBatchAverageSize(batches); + std::vector>> batches; + + if (argument->static_memory_optim() && inference::IsFileExists(path)) { + string::PrettyLogInfo("--- Performing static memory optimize"); + batches = DeseralizeBatchVarShapes(path); + } else { + string::PrettyLogInfo("--- Performing dynamic memory optimize"); + batches = FakeBatchVarShapes(argument->main_program()); + } + auto var_batch_ave_size = GetBatchAverageSize(batches); + + // Get min and max memory size. 
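The code that follows derives two clustering intervals from that min/max range; a quick worked example with made-up tensor sizes shows the magnitudes involved:

#include <algorithm>
#include <cstdio>
int main() {
  // Made-up average tensor sizes: 4 KB minimum, 40 MB maximum.
  const long long lo = 4LL << 10, hi = 40LL << 20;
  const int cluster_size = std::max(static_cast<int>((hi - lo) / 100), 1024);
  const int cluster_size1 = std::max(static_cast<int>((hi - lo) / 1000), 1024);
  std::printf("%d %d\n", cluster_size, cluster_size1);  // ~410 KB and ~41 KB buckets
  return 0;
}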
+ const auto range = GetRange(var_batch_ave_size); + const int cluster_size = std::max( + static_cast((range.second - range.first) / 100 /*cluster num*/), + 1024); + const int cluster_size1 = std::max( + static_cast((range.second - range.first) / 1000 /*cluster num*/), + 1024); - std::unordered_map tensor_nodes; - space_table_t space_table; - CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); + std::unordered_map tensor_nodes; + space_table_t space_table; + CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); - std::unordered_map reuse_table; - double max_saving_ratio = 0.; + std::unordered_map reuse_table; + double max_saving_ratio = 0.; - std::vector> strategies; + std::vector> strategies; - for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + if (argument->static_memory_optim()) { + // This strategy only makes sense in static memory optimization. strategies.emplace_back([&, sort_kind] { auto clustered_vars_by_batch_size = AnalysisBatchShapesByBatchSize(batches); MemoryAllocation allocation; MakeReusePlan(clustered_vars_by_batch_size, var_batch_ave_size, space_table, &reuse_table, sort_kind, &allocation); return allocation; }); + } - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, 1024); // interval 1kb - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = + AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = + AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size1); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( + space_table, batches, + std::numeric_limits::max()); // no intervals + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + } - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, 1024 * 1024); // interval 1MB - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + std::function* best_strategy{nullptr}; - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, - std::numeric_limits::max()); // no intervals - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + // Try all strategies to get the best result.
+ for (auto& strategy : strategies) { + auto allocation = strategy(); + string::PrettyLogDetail("--- get strategy saving %f memory for workspace", + allocation.GetSavingRatio()); + if (allocation.GetSavingRatio() > max_saving_ratio) { + max_saving_ratio = allocation.GetSavingRatio(); + best_strategy = &strategy; } + } + if (!best_strategy) { + LOG(ERROR) << "This model benefits little from memory optimization; skipping it"; + return; + } + auto memory_allocation = (*best_strategy)(); - std::function* best_strategy{nullptr}; + string::PrettyLogInfo( + "--- Saved %.2f%s memory for workspace(temporary variables)", + memory_allocation.GetSavingRatio() * 100, "%"); - // Try all strategies to get the best result. - for (auto& strategy : strategies) { - auto allocation = strategy(); - string::PrettyLogDetail("--- get strategy saving %f memory for workspace", - allocation.GetSavingRatio()); - if (allocation.GetSavingRatio() > max_saving_ratio) { - max_saving_ratio = allocation.GetSavingRatio(); - best_strategy = &strategy; - } - } - if (!best_strategy) { - LOG(ERROR) - << "This model makes poor memory optimize, skip memory optimize"; - return; - } - auto memory_allocation = (*best_strategy)(); - - string::PrettyLogH2( - "--- Saved %.2f%s memory for workspace(temporary variables)", - memory_allocation.GetSavingRatio() * 100, "%"); - string::PrettyLogDetail("--- Allocated %d MB", - memory_allocation.allocated / 1024. / 1024.); - string::PrettyLogDetail("--- Saved %d MB", - memory_allocation.saved / 1024. / 1024.); - argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, - new std::unordered_set); - auto& vars2remove = - argument->main_graph().Get>( - framework::ir::kGraphToProgramVarsToRemove); - - PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); - argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); - } + argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, + new std::unordered_set); + auto& vars2remove = + argument->main_graph().Get>( + framework::ir::kGraphToProgramVarsToRemove); + + PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); + argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); } float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const { diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index fa1ad9c8c6..216f416de0 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index f9da3004ed..8efd514bd8 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -95,12 +95,14 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { CP_MEMBER(memory_pool_init_size_mb_); CP_MEMBER(enable_memory_optim_); - CP_MEMBER(memory_optim_force_update_); + CP_MEMBER(static_memory_optim_); + CP_MEMBER(static_memory_optim_force_update_); // TensorRT related.
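Each CP_MEMBER line in this copy constructor copies one field from `other`; a sketch of the macro consistent with these call sites (its actual definition sits earlier in analysis_config.cc, outside this hunk):

struct ConfigSketch {
  bool use_tensorrt_{false};
  ConfigSketch() = default;
  ConfigSketch(const ConfigSketch& other) {
// Hypothetical definition; copies the named member from `other`.
#define CP_MEMBER(member__) member__ = other.member__;
    CP_MEMBER(use_tensorrt_);
#undef CP_MEMBER
  }
};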
CP_MEMBER(use_tensorrt_); CP_MEMBER(tensorrt_workspace_size_); CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_min_subgraph_size_); + CP_MEMBER(tensorrt_precision_mode_); // MKLDNN releated. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -140,9 +142,9 @@ void contrib::AnalysisConfig::EnableMKLDNN() { Update(); } -void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, - int max_batch_size, - int min_subgraph_size) { +void contrib::AnalysisConfig::EnableTensorRtEngine( + int workspace_size, int max_batch_size, int min_subgraph_size, + contrib::AnalysisConfig::Precision precision_mode) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -153,6 +155,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; + tensorrt_precision_mode_ = precision_mode; Update(); #else @@ -238,7 +241,8 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { ss << tensorrt_min_subgraph_size_; ss << enable_memory_optim_; - ss << memory_optim_force_update_; + ss << static_memory_optim_; + ss << static_memory_optim_force_update_; ss << use_mkldnn_; for (auto &item : mkldnn_enabled_op_types_) ss << item; @@ -278,9 +282,11 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #endif } -void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) { +void contrib::AnalysisConfig::EnableMemoryOptim( + bool static_optim, bool force_update_static_cache) { enable_memory_optim_ = true; - memory_optim_force_update_ = force_update_cache; + static_memory_optim_ = static_optim; + static_memory_optim_force_update_ = force_update_static_cache; Update(); } @@ -300,4 +306,16 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, Update(); } +NativeConfig contrib::AnalysisConfig::ToNativeConfig() const { + NativeConfig config; + config.model_dir = model_dir_; + config.prog_file = prog_file_; + config.param_file = params_file_; + config.use_gpu = use_gpu_; + config.device = device_id_; + config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); + config.specify_input_name = specify_input_name_; + return config; +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2b0cad5faa..3a5f21d475 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/api/analysis_predictor.h" #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -37,6 +39,8 @@ #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" + #endif DECLARE_bool(profile); @@ -44,6 +48,12 @@ DECLARE_bool(profile); namespace paddle { using contrib::AnalysisConfig; +using inference::Singleton; +#if PADDLE_WITH_TENSORRT +using inference::tensorrt::TRTInt8Calibrator; +using 
inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; +#endif namespace { bool IsPersistable(const framework::VarDesc *var) { @@ -298,15 +308,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, bool AnalysisPredictor::GetFetch(std::vector *outputs, framework::Scope *scope) { VLOG(3) << "Predictor::get_fetch"; - outputs->resize(fetchs_.size()); - for (size_t i = 0; i < fetchs_.size(); ++i) { - int idx = boost::get(fetchs_[i]->GetAttr("col")); + outputs->resize(fetches_.size()); + for (size_t i = 0; i < fetches_.size(); ++i) { + int idx = boost::get(fetches_[i]->GetAttr("col")); PADDLE_ENFORCE((size_t)idx == i); framework::LoDTensor &fetch = framework::GetFetchVariable(*scope, "fetch", idx); auto type = fetch.type(); auto output = &(outputs->at(i)); - output->name = fetchs_[idx]->Input("X")[0]; + output->name = fetches_[idx]->Input("X")[0]; if (type == framework::proto::VarType::FP32) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; @@ -327,7 +337,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); - argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_); + argument_.SetStaticMemoryOptim(config_.static_memory_optim_); + argument_.SetStaticMemoryOptimForceUpdate( + config_.static_memory_optim_force_update_); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program if (!config_.model_dir().empty()) { @@ -337,6 +349,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); PADDLE_ENFORCE(!config_.prog_file().empty()); + std::string dir = inference::analysis::GetDirRoot(config_.prog_file()); + argument_.SetModelProgramPath(config_.prog_file()); argument_.SetModelParamsPath(config_.params_file()); } @@ -347,6 +361,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); + argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); } if (config_.use_mkldnn_) { @@ -422,10 +437,10 @@ void AnalysisPredictor::PrepareFeedFetch() { feed_names_[op->Output("Out")[0]] = idx; } else if (op->Type() == "fetch") { int idx = boost::get(op->GetAttr("col")); - if (fetchs_.size() <= static_cast(idx)) { - fetchs_.resize(idx + 1); + if (fetches_.size() <= static_cast(idx)) { + fetches_.resize(idx + 1); } - fetchs_[idx] = op; + fetches_[idx] = op; } } } @@ -567,7 +582,67 @@ bool AnalysisPredictor::LoadParameters() { return true; } +#if PADDLE_WITH_TENSORRT +bool AnalysisPredictor::SaveTrtCalibToDisk() { + PADDLE_ENFORCE(config_.tensorrt_engine_enabled(), + "This func can be invoked only in trt mode"); + auto &block = inference_program_->Block(0); + for (auto &op_desc : block.AllOps()) { + if (op_desc->Type() == "tensorrt_engine") { + std::string engine_name = + boost::get(op_desc->GetAttr("engine_key")); + if (!Singleton::Global().Has(engine_name)) { + LOG(ERROR) << "You should run the predictor(with trt) on the real data " + "to generate calibration info"; + return false; + } + TRTCalibratorEngine *calib_engine = + Singleton::Global().Get(engine_name); + LOG(INFO) << "Wait for calib threads done."; + 
calib_engine->calib_->waitAndSetDone(); + LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot " + "of time..."; + calib_engine->thr_->join(); + std::string calibration_table_data = + calib_engine->calib_->getCalibrationTableAsString(); + + if (calibration_table_data.empty()) { + LOG(ERROR) << "the calibration table is empty."; + return false; + } + + std::string model_opt_cache_dir = + argument_.Has("model_dir") + ? argument_.model_dir() + : inference::analysis::GetDirRoot(argument_.model_program_path()); + + std::string calibration_table_data_path = + inference::analysis::GetTrtCalibPath( + inference::analysis::GetOrCreateModelOptCacheDir( + model_opt_cache_dir), + engine_name); + + std::ofstream ofile(calibration_table_data_path, std::ios::out); + LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file " + << calibration_table_data_path; + ofile << calibration_table_data; + ofile.close(); + } + } + // Free all calibrator resources. + Singleton::Global().DeleteALL(); + return true; +} +#endif + AnalysisPredictor::~AnalysisPredictor() { +#if PADDLE_WITH_TENSORRT + if (config_.tensorrt_engine_enabled() && + config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 && + Singleton::Global().Has()) { + SaveTrtCalibToDisk(); + } +#endif if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); @@ -638,12 +713,12 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { // check if the cache exists if (!config_.enable_memory_optim()) { need = false; - } else if (config_.enable_memory_optim() && + } else if (config_.static_memory_optim_ && !inference::IsFileExists(inference::analysis::GetMemoryCachePath( config_.model_dir(), config_.prog_file()))) { need = true; - } else if (config_.enable_memory_optim() && - config_.memory_optim_force_update_) { + } else if (config_.static_memory_optim_ && + config_.static_memory_optim_force_update_) { need = true; } @@ -651,6 +726,10 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { return need; } +std::string AnalysisPredictor::GetSeriazlizedProgram() const { + return inference_program_->Proto()->SerializeAsString(); +} + template <> std::unique_ptr CreatePaddlePredictor( const contrib::AnalysisConfig &config) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index e25b5a7047..fa1d0d596d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -45,6 +45,7 @@ using contrib::AnalysisConfig; class AnalysisPredictor : public PaddlePredictor { public: explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {} + ~AnalysisPredictor(); bool Init(const std::shared_ptr &parent_scope, const std::shared_ptr &program = nullptr); @@ -74,6 +75,8 @@ class AnalysisPredictor : public PaddlePredictor { void SetMkldnnThreadID(int tid); + std::string GetSeriazlizedProgram() const override; + protected: // For memory optimization. 
bool need_collect_var_shapes_for_memory_optim(); @@ -95,7 +98,21 @@ class AnalysisPredictor : public PaddlePredictor { template void GetFetchOne(const framework::LoDTensor &fetchs, PaddleTensor *output_data); - ~AnalysisPredictor(); + +#if PADDLE_WITH_TENSORRT + // When we use the Paddle-TRT INT8 engine, we first need to generate + // calibration table data. The calibration table contains the range of each + // op's inputs and outputs, and the whole process can be divided into + // several steps: + // + // 1. Build a 32-bit engine, run it on the calibration set, and record a + // histogram of the distribution of activation values for each tensor. + // 2. Build a calibration table from the histograms. + // + // After step 2, we need to store the calibration table on disk. + bool SaveTrtCalibToDisk(); +#endif // Some more detailed tests are made friends of the predictor, so that all // the details can be tested. @@ -115,7 +132,7 @@ class AnalysisPredictor : public PaddlePredictor { std::shared_ptr inference_program_; std::vector feeds_; std::map feed_names_; - std::vector fetchs_; + std::vector fetches_; // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, wrong results and memory leaks, so cache them. std::vector feed_tensors_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 4688e93d71..20b61344da 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -215,6 +215,8 @@ TEST(AnalysisPredictor, memory_optim) { { // The first predictor helps to cache the memory optimize strategy. auto predictor = CreatePaddlePredictor(config); + LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram(); + ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty()); // Run several times to check the parameters are not reused by mistake. for (int i = 0; i < 5; i++) { diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 9be059c73e..6cd18277d6 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License.
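From the caller's side, the calibration workflow described in analysis_predictor.h above reduces to enabling INT8 and running real data through the predictor. A hedged sketch: the model path is hypothetical, EnableTensorRtEngine and Precision are as declared in this patch, and EnableUseGpu is assumed from the surrounding 1.x AnalysisConfig API:

#include "paddle/fluid/inference/api/paddle_inference_api.h"
int main() {
  paddle::contrib::AnalysisConfig config("/models/mobilenet");  // hypothetical path
  config.EnableUseGpu(100 /*MB pool*/, 0 /*gpu id*/);  // TRT requires GPU mode
  config.EnableTensorRtEngine(1 << 20, 1, 3,
      paddle::contrib::AnalysisConfig::Precision::kInt8);
  auto predictor = paddle::CreatePaddlePredictor(config);
  // Feed real calibration batches via predictor->Run(...) here; when the
  // predictor is destroyed, SaveTrtCalibToDisk() writes the calibration
  // table under <model_dir>/_opt_cache/.
  return 0;
}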
+#include +#include "paddle/fluid/framework/commit.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -97,4 +99,12 @@ void PaddleBuf::Free() { } } +std::string get_version() { + std::stringstream ss; + ss << "version: " << framework::paddle_version() << "\n"; + ss << "commit: " << framework::paddle_commit() << "\n"; + ss << "branch: " << framework::paddle_compile_branch() << "\n"; + return ss.str(); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc index 7a579610ee..2c450ef7ce 100644 --- a/paddle/fluid/inference/api/api_tester.cc +++ b/paddle/fluid/inference/api/api_tester.cc @@ -61,4 +61,10 @@ TEST(paddle_inference_api, demo) { predictor->Run({}, &outputs); } +TEST(paddle_inference_api, get_version) { + LOG(INFO) << "paddle version:\n" << get_version(); + auto version = get_version(); + ASSERT_FALSE(version.empty()); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 1cee890450..5b899b26d6 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -42,6 +42,10 @@ struct AnalysisConfig { explicit AnalysisConfig(const std::string& model_dir); explicit AnalysisConfig(const std::string& prog_file, const std::string& params_file); + enum class Precision { + kFloat32 = 0, + kInt8, + }; /** Set model with a directory. */ @@ -135,7 +139,8 @@ struct AnalysisConfig { * subgraph is less than this, it will not transfer to TensorRT engine. */ void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1, int min_subgraph_size = 3); + int max_batch_size = 1, int min_subgraph_size = 3, + Precision precision = Precision::kFloat32); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -162,17 +167,7 @@ struct AnalysisConfig { /** Transform the AnalysisConfig to NativeConfig. */ - NativeConfig ToNativeConfig() const { - NativeConfig config; - config.model_dir = model_dir_; - config.prog_file = prog_file_; - config.param_file = params_file_; - config.use_gpu = use_gpu_; - config.device = device_id_; - config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); - config.specify_input_name = specify_input_name_; - return config; - } + NativeConfig ToNativeConfig() const; /** Specify the operator type list to use MKLDNN acceleration. * @param op_list the operator type list. */ @@ -195,7 +190,8 @@ struct AnalysisConfig { /** Turn on memory optimize * NOTE still in development, will release latter. */ - void EnableMemoryOptim(bool force_update_cache = false); + void EnableMemoryOptim(bool static_optim = false, + bool force_update_static_cache = false); /** Tell whether the memory optimization is activated. */ bool enable_memory_optim() const; @@ -238,10 +234,12 @@ struct AnalysisConfig { // We set this variable to control the minimum number of nodes in the // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; + Precision tensorrt_precision_mode_; // memory reuse related. 
bool enable_memory_optim_{false}; - bool memory_optim_force_update_{false}; + bool static_memory_optim_{false}; + bool static_memory_optim_force_update_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 46b510fd1e..4069832246 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -215,6 +215,14 @@ class PaddlePredictor { */ virtual ~PaddlePredictor() = default; + /** \brief Get the serialized model program that executes in inference phase. + * Its data type is ProgramDesc, which is a protobuf message. + */ + virtual std::string GetSeriazlizedProgram() const { + assert(false); // Force raise error. + return "NotImplemented"; + }; + /** The common configs for all the predictors. */ struct Config { @@ -288,4 +296,6 @@ std::unique_ptr CreatePaddlePredictor(const ConfigT& config); int PaddleDtypeSize(PaddleDType dtype); +std::string get_version(); + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index d3a60d2099..391932a1ee 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -154,13 +154,16 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // +#endif }); for (int i = 6; i >= 3; i--) { diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 9afeafd176..f4977d08c4 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,4 @@ -nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) +nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 01d7f700da..c5a413221e 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -29,9 +29,9 @@ TEST(OpConverter, ConvertBlock) { // init trt engine cudaStream_t stream_; std::unique_ptr engine_; - engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_)); - engine_->InitNetwork(); PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset(new TensorRTEngine(5, 1 << 15, stream_)); + engine_->InitNetwork(); engine_->DeclareInput("conv2d-X", 
nvinfer1::DataType::kFLOAT, nvinfer1::Dims3(2, 5, 5)); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index f313beb73b..e83961f3d7 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -78,11 +78,9 @@ class TRTConvertValidation { scope_(scope), if_add_batch_(if_add_batch), max_batch_size_(max_batch_size) { - // create engine. - engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_)); - engine_->InitNetwork(); - PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_)); + engine_->InitNetwork(); } // Declare a Variable as input with random initialization. @@ -175,7 +173,7 @@ class TRTConvertValidation { op_->Run(scope_, place); // Execute TRT. engine_->Execute(batch_size); - cudaStreamSynchronize(*engine_->stream()); + cudaStreamSynchronize(engine_->stream()); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); const size_t output_space_size = 3000; @@ -184,7 +182,7 @@ class TRTConvertValidation { std::vector fluid_out; std::vector trt_out(output_space_size); engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); - cudaStreamSynchronize(*engine_->stream()); + cudaStreamSynchronize(engine_->stream()); auto* var = scope_.FindVar(output); auto tensor = var->GetMutable(); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index f739752cbc..10f48462cf 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -42,14 +42,13 @@ void TensorRTEngine::Execute(int batch_size) { PADDLE_ENFORCE(buf.device == DeviceType::GPU); buffers.push_back(buf.buffer); } - PADDLE_ENFORCE_NOT_NULL(stream_); - infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); - cudaStreamSynchronize(*stream_); + infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr); + cudaStreamSynchronize(stream_); SetRuntimeBatch(batch_size); } TensorRTEngine::~TensorRTEngine() { - cudaStreamSynchronize(*stream_); + cudaStreamSynchronize(stream_); // clean buffer for (auto &buf : buffers_) { if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { @@ -70,6 +69,13 @@ void TensorRTEngine::FreezeNetwork() { // build engine. 
infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxWorkspaceSize(max_workspace_); + if (enable_int8_) { + infer_builder_->setInt8Mode(true); + PADDLE_ENFORCE( + calibrator_ != nullptr, + "The precision mode is 'INT8', the calibrator should not be nullptr"); + infer_builder_->setInt8Calibrator(calibrator_); + } infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); @@ -173,7 +179,7 @@ void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToDevice, *stream_), + cudaMemcpyDeviceToDevice, stream_), 0); } @@ -194,7 +200,7 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToHost, *stream_)); + cudaMemcpyDeviceToHost, stream_)); } Buffer &TensorRTEngine::buffer(const std::string &name) { @@ -211,12 +217,11 @@ void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer); PADDLE_ENFORCE_NOT_NULL(data); - PADDLE_ENFORCE_NOT_NULL(stream_); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); buf.size = size; PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyHostToDevice, *stream_)); + cudaMemcpyHostToDevice, stream_)); } void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, @@ -227,7 +232,7 @@ void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyDeviceToDevice, *stream_)); + cudaMemcpyDeviceToDevice, stream_)); } void TensorRTEngine::SetITensor(const std::string &name, diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index f5b2c28ba9..cdfe09b5a7 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -23,12 +23,14 @@ limitations under the License. */ #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { namespace inference { namespace tensorrt { +class TRTInt8Calibrator; /* * TensorRT Engine. * @@ -54,17 +56,17 @@ class TensorRTEngine : public EngineBase { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, int max_workspace, - cudaStream_t* stream = nullptr, int device = 0, + TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream, + int device = 0, bool enable_int8 = false, + TRTInt8Calibrator* calibrator = nullptr, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), - stream_(stream ? 
stream : &default_stream_), - logger_(logger), - device_(device) { - freshDeviceId(); - cudaStreamCreate(stream_); - } + stream_(stream), + device_(device), + enable_int8_(enable_int8), + calibrator_(calibrator), + logger_(logger) {} virtual ~TensorRTEngine(); @@ -102,7 +104,7 @@ class TensorRTEngine : public EngineBase { // NOTE this should be used after calling `FreezeNetwork`. Buffer& buffer(const std::string& name) override; - cudaStream_t* stream() { return stream_; } + cudaStream_t stream() { return stream_; } // Fill an input from CPU memory with name and size. void SetInputFromCPU(const std::string& name, const void* data, size_t size); @@ -142,8 +144,8 @@ class TensorRTEngine : public EngineBase { // In the normal case, the paddle-trt exists bug when runing the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the // paddle-tensorrt will do the merging optimization, which fuse those conv - // into - // one conv, and then trigger bug. So, We should use strategy to avoid this + // into one conv, and then trigger bug. So, We should use strategy to avoid + // this // optimization for the time being. This bug will be fixed in the future. std::unordered_map itensor_quote_num; @@ -156,11 +158,15 @@ class TensorRTEngine : public EngineBase { // the max memory size the engine uses int max_workspace_; + cudaStream_t stream_; + // The specific GPU id that the TensorRTEngine bounded to. + int device_; + + bool enable_int8_; + TRTInt8Calibrator* calibrator_; // batch size of the current data, will be updated each Executation. int batch_size_{-1}; - cudaStream_t* stream_; - // If stream_ is not set from outside, hold its own stream. - cudaStream_t default_stream_; + nvinfer1::ILogger& logger_; std::vector buffers_; @@ -169,8 +175,6 @@ class TensorRTEngine : public EngineBase { std::unordered_map itensor_map_; - // The specific GPU id that the TensorRTEngine bounded to. - int device_; std::vector> owned_plugin_; // TensorRT related internal members @@ -208,38 +212,6 @@ class TensorRTEngine : public EngineBase { #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \ engine__->network()->add##layer__(ARGS); -/* - * Helper to control the TensorRT engine's creation and deletion. - */ -class TRT_EngineManager { - public: - bool HasEngine(const std::string& name) const { - return engines_.count(name) != 0; - } - - // Get an engine called `name`. 
- TensorRTEngine* Get(const std::string& name) const { - return engines_.at(name).get(); - } - - // Create or get an engine called `name` - TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream, - const std::string& name, int gpu_device = 0) { - auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device); - engines_[name].reset(p); - return p; - } - - void DeleteALl() { - for (auto& item : engines_) { - item.second.reset(nullptr); - } - } - - private: - std::unordered_map> engines_; -}; - } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index da1f6535cb..9eed0f6ee9 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -27,8 +27,8 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - // ASSERT_EQ(0, cudaStreamCreate(&stream_)); - engine_ = new TensorRTEngine(10, 1 << 10, &stream_); + ASSERT_EQ(0, cudaStreamCreate(&stream_)); + engine_ = new TensorRTEngine(10, 1 << 10, stream_); engine_->InitNetwork(); } diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc new file mode 100644 index 0000000000..4a85c8b8fe --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
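The calibrator implementation that follows coordinates two threads around a single-slot data pool: the Paddle op thread publishes a device batch through setBatch(), while TensorRT's builder thread drains it through getBatch() during calibration. Below is a minimal, standalone sketch of that same handshake in plain C++ (hypothetical names, no CUDA or TensorRT; the real class additionally copies device memory and hands out binding pointers):

    #include <condition_variable>
    #include <iostream>
    #include <mutex>
    #include <thread>

    // Single-slot producer/consumer pool, mirroring the calibrator's
    // calib_running_/data_is_set_/done_ flags.
    class BatchPool {
     public:
      bool SetBatch(int batch) {  // producer (the op thread)
        std::unique_lock<std::mutex> lk(mu_);
        cv_.wait(lk, [this] { return (!running_ && !is_set_) || done_; });
        if (done_) return false;
        slot_ = batch;
        is_set_ = true;
        cv_.notify_all();
        return true;
      }
      bool GetBatch(int* batch) {  // consumer (the builder thread)
        std::unique_lock<std::mutex> lk(mu_);
        running_ = false;  // the previous batch is fully consumed
        cv_.notify_all();
        cv_.wait(lk, [this] { return is_set_ || done_; });
        if (done_) return false;
        *batch = slot_;
        is_set_ = false;
        running_ = true;
        return true;
      }
      void SetDone() {
        std::lock_guard<std::mutex> lk(mu_);
        done_ = true;
        cv_.notify_all();
      }

     private:
      std::mutex mu_;
      std::condition_variable cv_;
      int slot_ = 0;
      bool running_ = true, is_set_ = false, done_ = false;
    };

    int main() {
      BatchPool pool;
      std::thread consumer([&] {
        int b;
        while (pool.GetBatch(&b)) std::cout << "calibrate on batch " << b << "\n";
      });
      for (int i = 0; i < 3; ++i) pool.SetBatch(i);
      pool.SetDone();
      consumer.join();
      return 0;
    }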
+ +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" +#include "glog/logging.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// set the batch size before constructing the thread to execute engine +int TRTInt8Calibrator::getBatchSize() const { return batch_size_; } + +TRTInt8Calibrator::TRTInt8Calibrator( + const std::unordered_map& buffers, int batch_size, + std::string engine_name, const platform::Place place) + : batch_size_(batch_size), engine_name_(engine_name) { + int i = 0; + VLOG(4) << "Init a new calibrator: " << engine_name_; + for (const auto it : buffers) { + framework::Tensor temp_tensor; + std::string input_name = it.first; + int data_size = it.second; + int num_ele = data_size / sizeof(int16_t); + framework::DDim data_shape = framework::make_ddim({num_ele}); + temp_tensor.Resize(data_shape); + data_tensors_.push_back(temp_tensor); + data_buffers_[input_name] = std::pair( + static_cast(temp_tensor.mutable_data(place)), num_ele); + i += 1; + } +} + +TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data) + : batch_size_(0), + calib_running_(false), + data_is_set_(false), + done_(true), + calibration_table_(calib_data) {} + +void TRTInt8Calibrator::waitAndSetDone() { + std::unique_lock lk(mut_); + while ((calib_running_ || data_is_set_) && !done_) cond_.wait(lk); + if (!done_) { + done_ = true; + cond_.notify_all(); + } +} + +// There might be more than one input for trt subgraph, +// So, we use a map to store input information. +bool TRTInt8Calibrator::setBatch( + const std::unordered_map& data) { + VLOG(3) << "set batch: " << engine_name_; + std::unique_lock lk(mut_); + // There is a producer and a consumer. The producer set the batch data and + // the consumer get the batch data. The size of the data pool is one. + // So, the producer has to wait for the consumer to finish processing before + // they can set the data. + while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk); + // The done_ is set to true using waitAndSetDone, When all calibration data + // are processed. + if (done_) return false; + + // Sets the batch. + for (const auto& it : data) { + auto dataptr = data_buffers_.find(it.first); + if (dataptr == data_buffers_.end()) { + LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first + << "' does not match with the buffer names"; + } + const auto& d = dataptr->second; + PADDLE_ENFORCE( + cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), + "Fail to cudaMemcpy %s for %s", engine_name_, it.first); + } + + data_is_set_ = true; + cond_.notify_all(); + return true; +} + +bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, + int num_bindings) { + VLOG(4) << "get batch: " << engine_name_; + std::unique_lock lk(mut_); + // The consumer has just finished processing a data. + // The producer can set the data again. + calib_running_ = false; + cond_.notify_all(); + + // As long as there is data in the pool, the consumer can get it. 
+  while (!data_is_set_ && !done_) cond_.wait(lk);
+  if (done_) return false;
+
+  // Gets the batch.
+  for (int i = 0; i < num_bindings; i++) {
+    auto it = data_buffers_.find(names[i]);
+    if (it == data_buffers_.end()) {
+      LOG(FATAL) << "Calibration engine asked for unknown tensor name '"
+                 << names[i] << "' at position " << i;
+    }
+    bindings[i] = it->second.first;
+  }
+
+  data_is_set_ = false;
+  calib_running_ = true;
+  VLOG(4) << "get batch done: " << engine_name_;
+  return true;
+}
+
+void TRTInt8Calibrator::setDone() {
+  std::unique_lock<std::mutex> lk(mut_);
+  done_ = true;
+  cond_.notify_all();
+}
+
+const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) {
+  if (calibration_table_.empty()) return nullptr;
+  length = calibration_table_.size();
+  return calibration_table_.data();
+}
+
+void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
+                                              std::size_t length) {
+  calibration_table_ = std::string(static_cast<const char*>(ptr), length);
+  VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr
+          << " length=" << length;
+}
+
+TRTInt8Calibrator::~TRTInt8Calibrator() {
+  VLOG(4) << "Destroying calibrator for " << engine_name_;
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
new file mode 100644
index 0000000000..919f5d55f8
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
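On the engine side, this calibrator plugs into TensorRT's standard INT8 build flow via the builder calls shown in the engine.cc hunk above (setInt8Mode, setInt8Calibrator). The sketch below illustrates that flow under the pre-6.x TensorRT API used here; the builder, network, and calibrator are assumed to be constructed elsewhere:

    #include <NvInfer.h>

    // Sketch: enable INT8 on an existing builder before building the engine.
    nvinfer1::ICudaEngine* BuildInt8Engine(nvinfer1::IBuilder* builder,
                                           nvinfer1::INetworkDefinition* network,
                                           nvinfer1::IInt8Calibrator* calibrator,
                                           int max_batch, size_t max_workspace) {
      builder->setMaxBatchSize(max_batch);
      builder->setMaxWorkspaceSize(max_workspace);
      builder->setInt8Mode(true);              // request INT8 kernels
      builder->setInt8Calibrator(calibrator);  // feeds batches via getBatch()
      // Runs calibration (unless a calibration cache is supplied).
      return builder->buildCudaEngine(*network);
    }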
+
+#pragma once
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <NvInfer.h>
+#include <cuda_runtime_api.h>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class TensorRTEngine;
+
+struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
+ public:
+  TRTInt8Calibrator(const std::unordered_map<std::string, size_t>& buffers,
+                    int batch_size, std::string engine_name,
+                    const platform::Place place);
+
+  explicit TRTInt8Calibrator(const std::string& calibration_data);
+  ~TRTInt8Calibrator();
+
+  int getBatchSize() const override;
+
+  bool getBatch(void* bindings[], const char* names[],
+                int num_bindings) override;
+
+  bool setBatch(const std::unordered_map<std::string, void*>& data);
+  void setDone();
+  void waitAndSetDone();
+
+  const void* readCalibrationCache(std::size_t& length) override;
+  void writeCalibrationCache(const void* ptr, std::size_t length) override;
+  const std::string& getCalibrationTableAsString() {
+    return calibration_table_;
+  }
+
+ private:
+  const int batch_size_;
+
+  bool calib_running_{true};
+  bool data_is_set_{false};
+  bool done_{false};
+
+  std::mutex mut_;
+  std::condition_variable cond_;
+
+  std::unordered_map<std::string, std::pair<void*, size_t>> data_buffers_;
+  std::vector<framework::Tensor> data_tensors_;
+
+  std::string engine_name_;
+  std::string calibration_table_;
+};
+
+class TRTCalibratorEngine {
+ public:
+  TRTCalibratorEngine() {}
+  std::unique_ptr<TRTInt8Calibrator> calib_;
+  std::unique_ptr<std::thread> thr_;
+  std::unique_ptr<TensorRTEngine> engine_;
+};
+
+/*
+ * Manager to control the TensorRT Int8 calibrator creation and deletion.
+ */
+class TRTCalibratorEngineManager {
+ public:
+  bool Has() const { return res_.size() > 0; }
+  bool Has(const std::string& name) const {
+    if (res_.count(name) == 0) return false;
+    return res_.at(name).get() != nullptr;
+  }
+
+  // Get an Int8Calibrator via name.
+  TRTCalibratorEngine* Get(const std::string& name) const {
+    return res_.at(name).get();
+  }
+
+  // Look up or create a calibrator.
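+  // Unlike Create(), an existing engine (and its calibration state) is
+  // reused rather than replaced.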
+ TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) { + if (res_.count(engine_name) == 0) { + auto* p = new TRTCalibratorEngine; + res_[engine_name].reset(p); + } + return res_.at(engine_name).get(); + } + + // Create an Int8Calibrator + TRTCalibratorEngine* Create(const std::string& engine_name) { + auto* p = new TRTCalibratorEngine; + res_[engine_name].reset(p); + return p; + } + + void DeleteALL() { + for (auto& item : res_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map> res_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 423c39813f..07b9e0e051 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -54,6 +54,7 @@ else() message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1") endif() + # RNN2 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") @@ -115,6 +116,10 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) endif() inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL) +# googlenet +inference_analysis_api_test_with_fake_data(test_analyzer_googlenet + "${INFERENCE_DEMO_INSTALL_DIR}/googlenet" analyzer_resnet50_tester.cc "googlenet.tar.gz" SERIAL) + # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL) diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 4ec9404ab4..e78ab942d1 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -253,7 +253,7 @@ void compare(bool use_mkldnn = false) { } // Compare result of NativeConfig and AnalysisConfig with memory optimization. -TEST(Analyzer_dam, compare_with_memory_optim) { +TEST(Analyzer_dam, compare_with_static_memory_optim) { // The small dam will core in CI, but works in local. if (FLAGS_max_turn_num == 9) { contrib::AnalysisConfig cfg, cfg1; @@ -263,7 +263,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) { SetInput(&input_slots_all); // Run the first time to force to update memory cache SetConfig(&cfg); - cfg.EnableMemoryOptim(true); + cfg.EnableMemoryOptim(true, true /*force update*/); CompareNativeAndAnalysis( reinterpret_cast(&cfg), @@ -271,7 +271,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) { // Run second time to use the memory cache and perform memory optimization. SetConfig(&cfg1); - cfg1.EnableMemoryOptim(); + cfg1.EnableMemoryOptim(true, false /*do not force update*/); CompareNativeAndAnalysis( reinterpret_cast(&cfg1), @@ -279,6 +279,24 @@ TEST(Analyzer_dam, compare_with_memory_optim) { } } +TEST(Analyzer_dam, compare_with_dynamic_memory_optim) { + // The small dam will core in CI, but works in local. 
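+  // Unlike the static variants above, this test calls EnableMemoryOptim()
+  // with its default arguments, which exercises the dynamic
+  // memory-optimization path.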
+ if (FLAGS_max_turn_num == 9) { + contrib::AnalysisConfig cfg, cfg1; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + // Run the first time to force to update memory cache + SetConfig(&cfg); + cfg.EnableMemoryOptim(); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), + input_slots_all); + } +} + TEST(Analyzer_dam, compare) { compare(); } #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index d2ca1d0b00..b1f7a3464a 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -56,6 +56,13 @@ DECLARE_int32(paddle_num_threads); namespace paddle { namespace inference { +float Random(float low, float high) { + static std::random_device rd; + static std::mt19937 mt(rd()); + std::uniform_real_distribution dist(low, high); + return dist(mt); +} + void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { const auto *analysis_config = reinterpret_cast(config); @@ -176,7 +183,7 @@ void SetFakeImageInput(std::vector> *inputs, float *input_data = static_cast(input.data.data()); // fill input data, for profile easily, do not use random data here. for (size_t j = 0; j < len; ++j) { - *(input_data + j) = static_cast(j) / len; + *(input_data + j) = Random(0.0, 1.0) / 10.; } } (*inputs).emplace_back(input_slots); @@ -344,6 +351,16 @@ void CompareNativeAndAnalysis( CompareResult(analysis_outputs, native_outputs); } +void CompareNativeAndAnalysis( + PaddlePredictor *native_pred, PaddlePredictor *analysis_pred, + const std::vector> &inputs) { + int batch_size = FLAGS_batch_size; + std::vector native_outputs, analysis_outputs; + native_pred->Run(inputs[0], &native_outputs, batch_size); + analysis_pred->Run(inputs[0], &analysis_outputs, batch_size); + CompareResult(analysis_outputs, native_outputs); +} + template std::string LoDTensorSummary(const framework::LoDTensor &tensor) { std::stringstream ss; diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 5aca807ee3..db7109b750 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -107,6 +107,27 @@ void compare(std::string model_dir, bool use_tensorrt) { inputs_all); } +void compare_continuous_input(std::string model_dir, bool use_tensorrt) { + contrib::AnalysisConfig analysis_config; + SetConfig(&analysis_config, model_dir, true, + use_tensorrt, FLAGS_batch_size); + auto config = + reinterpret_cast(&analysis_config); + auto native_pred = CreateTestPredictor(config, false); + auto analysis_pred = CreateTestPredictor(config, true); + for (int i = 0; i < 100; i++) { + std::vector> inputs_all; + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, + FLAGS_param_filename); + } else { + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + } + CompareNativeAndAnalysis(native_pred.get(), analysis_pred.get(), + inputs_all); + } +} + TEST(TensorRT_mobilenet, compare) { std::string model_dir = FLAGS_infer_model + "/mobilenet"; compare(model_dir, /* use_tensorrt */ true); @@ -162,5 +183,15 @@ TEST(TensorRT_mobilenet, profile) { profile(model_dir, true, false); } +TEST(resnet50, compare_continuous_input) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + 
compare_continuous_input(model_dir, true); +} + +TEST(resnet50, compare_continuous_input_native) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + compare_continuous_input(model_dir, false); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 64aa63ffe9..5d8684f083 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include +#include #include #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" @@ -37,7 +38,7 @@ template void *Alloc(const Place &place, size_t size); template -void Free(const Place &place, void *p); +void Free(const Place &place, void *p, size_t size); template size_t Used(const Place &place); @@ -52,6 +53,11 @@ size_t memory_usage(const platform::Place &p); using BuddyAllocator = detail::BuddyAllocator; +std::unordered_map> + gpu_mem_info; + BuddyAllocator *GetCPUBuddyAllocator() { // We tried thread_local for inference::RNN1 model, but that not works much // for multi-thread test. @@ -98,7 +104,8 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { } template <> -void Free(const platform::CPUPlace &place, void *p) { +void Free(const platform::CPUPlace &place, void *p, + size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -177,9 +184,16 @@ void *Alloc(const platform::CUDAPlace &place, LOG(WARNING) << "GPU memory used: " << string::HumanReadableSize(Used(place)); platform::SetDeviceId(cur_dev); - } - if (FLAGS_init_allocated_mem) { - cudaMemset(ptr, 0xEF, size); + } else { + gpu_mem_info[place.device].first += size; + if (gpu_mem_info[place.device].first > gpu_mem_info[place.device].second) { + gpu_mem_info[place.device].second = gpu_mem_info[place.device].first; + VLOG(3) << "device: " << place.device << " peak memory usage : " + << (gpu_mem_info[place.device].second >> 20) << " MiB"; + } + if (FLAGS_init_allocated_mem) { + cudaMemset(ptr, 0xEF, size); + } } return ptr; #else @@ -188,9 +202,11 @@ void *Alloc(const platform::CUDAPlace &place, } template <> -void Free(const platform::CUDAPlace &place, void *p) { +void Free(const platform::CUDAPlace &place, void *p, + size_t size) { #ifdef PADDLE_WITH_CUDA GetGPUBuddyAllocator(place.device)->Free(p); + gpu_mem_info[place.device].first -= size; #else PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif @@ -243,7 +259,7 @@ void *Alloc(const platform::CUDAPinnedPlace &place, template <> void Free(const platform::CUDAPinnedPlace &place, - void *p) { + void *p, size_t size) { #ifdef PADDLE_WITH_CUDA GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -264,15 +280,17 @@ struct AllocVisitor : public boost::static_visitor { }; struct FreeVisitor : public boost::static_visitor { - inline explicit FreeVisitor(void *ptr) : ptr_(ptr) {} + inline explicit FreeVisitor(void *ptr, size_t size) + : ptr_(ptr), size_(size) {} template inline void operator()(const Place &place) const { - Free(place, ptr_); + Free(place, ptr_, size_); } private: void *ptr_; + size_t size_; }; size_t Usage::operator()(const platform::CPUPlace &cpu) const { @@ -304,8 +322,9 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { } void LegacyAllocator::Free(Allocation *allocation) { - 
boost::apply_visitor(legacy::FreeVisitor(allocation->ptr()), - allocation->place()); + boost::apply_visitor( + legacy::FreeVisitor(allocation->ptr(), allocation->size()), + allocation->place()); delete allocation; } } // namespace allocation diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 992a2bdd5a..e099425b94 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -13,6 +13,7 @@ add_subdirectory(detection) add_subdirectory(elementwise) add_subdirectory(fused) add_subdirectory(metrics) +add_subdirectory(ngraph) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) @@ -66,7 +67,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) endif() @@ -86,7 +87,6 @@ set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) -cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 2bebdb345a..c054fdb1ba 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -83,7 +83,7 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel( T* dbias) { const int outer_size = C; const int inner_size = N * HxW; - typedef cub::BlockReduce BlockReduce; + typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage ds_storage; __shared__ typename BlockReduce::TempStorage db_storage; @@ -97,13 +97,16 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel( ds_sum += dy[index] * x[index]; db_sum += dy[index]; } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + __syncthreads(); + auto ds_out = + BlockReduce(ds_storage).Reduce(static_cast(ds_sum), cub::Sum()); + auto db_out = + BlockReduce(db_storage).Reduce(static_cast(db_sum), cub::Sum()); + __syncthreads(); if (threadIdx.x == 0) { - dscale[i] = ds_sum; - dbias[i] = db_sum; + dscale[i] = ds_out; + dbias[i] = db_out; } - __syncthreads(); } } diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 30f700f1d9..e78ecc1a12 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -12,205 
+12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include +#include "paddle/fluid/operators/beam_search_op.h" + #include #include - -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/beam_search_op.h" namespace paddle { namespace operators { -void BeamSearch::operator()(const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores, - framework::LoDTensor *selected_ids, - framework::LoDTensor *selected_scores) { - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; - - auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); - auto selected_items = ToMap(items, high_level.back()); - VLOG(3) << "selected_items:"; - for (size_t i = 0; i < selected_items.size(); ++i) { - VLOG(3) << "offset:" << i; - for (auto &item : selected_items[i]) { - VLOG(3) << ItemToString(item); - } - } - - PruneEndBeams(pre_ids, &selected_items); - // calculate the output tensor's height - size_t num_instances = std::accumulate( - std::begin(selected_items), std::end(selected_items), 0, - [](size_t a, std::vector &b) { return a + b.size(); }); - // the output tensor shape should be [num_instances, 1] - auto dims = framework::make_ddim( - std::vector({static_cast(num_instances), 1})); - selected_ids->Resize(dims); - selected_scores->Resize(dims); - - std::map> hash; - framework::LoD new_lod; - auto *ids_data = selected_ids->mutable_data(platform::CPUPlace()); - auto *scores_data = - selected_scores->mutable_data(platform::CPUPlace()); - - // fill in data - std::vector low_level; - size_t low_offset = 0; - for (auto &items : selected_items) { - low_level.push_back(low_offset); - for (auto &item : items) { - ids_data[low_offset] = item.id; - scores_data[low_offset] = item.score; - low_offset++; - } - } - low_level.push_back(low_offset); - - // fill lod - framework::LoD lod(2); - lod[0].assign(high_level.begin(), high_level.end()); - lod[1].assign(low_level.begin(), low_level.end()); - if (!framework::CheckLoD(lod)) { - PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); - } - selected_ids->set_lod(lod); - selected_scores->set_lod(lod); -} - -void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, - std::vector> *items) { - auto *pre_ids_data = pre_ids.data(); - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; - for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { - size_t src_prefix_start = high_level[src_idx]; - size_t src_prefix_end = high_level[src_idx + 1]; - bool finish_flag = true; - for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) { - for (auto &item : items->at(offset)) { - if (item.id != static_cast(end_id_) || - pre_ids_data[offset] != end_id_) { - finish_flag = false; - break; - } - } - if (!finish_flag) break; - } - if (finish_flag) { // all branchs of the beam (source sentence) end and - // prune this beam - for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) - items->at(offset).clear(); - } - } -} - -std::vector> BeamSearch::ToMap( - const std::vector> &items, size_t element_num) { - std::vector> result; - result.resize(element_num); - for (auto &entries : items) { - for (const auto &item : entries) { - result[item.offset].push_back(item); - } - } - return result; -} - -std::vector> 
BeamSearch::SelectTopBeamSizeItems( - const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores) { - std::vector> result; - std::vector items; - // for each source sentence, select the top beam_size items across all - // candidate sets. - while (NextItemSet(pre_ids, pre_scores, &items)) { - std::nth_element( - std::begin(items), std::begin(items) + beam_size_, std::end(items), - [](const Item &a, const Item &b) { return a.score > b.score; }); - // prune the top beam_size items. - if (items.size() > beam_size_) { - items.resize(beam_size_); - } - result.emplace_back(items); - } - VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); - for (auto &items : result) { - VLOG(3) << "item set:"; - for (auto &item : items) { - VLOG(3) << ItemToString(item); - } - } - - return result; -} - -// the candidates of a source -bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids, - const framework::LoDTensor &pre_scores, - std::vector *items) { - if (sent_offset_ >= ids_->NumElements(lod_level_)) { - return false; - } - // find the current candidates - auto ids = *ids_; - auto scores = *scores_; - - auto abs_lod = framework::ToAbsOffset(ids.lod()); - - auto *ids_data = ids.data(); - auto *scores_data = scores.data(); - - size_t instance_dim = 1; - for (int i = 1; i < ids.dims().size(); i++) { - instance_dim *= ids.dims()[i]; - } - - auto *pre_ids_data = pre_ids.data(); - auto *pre_scores_data = pre_scores.data(); - items->clear(); - items->reserve(framework::product(ids.dims())); - for (size_t offset = abs_lod[lod_level_][sent_offset_]; - offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - auto pre_id = pre_ids_data[offset]; - auto pre_score = pre_scores_data[offset]; - if (pre_id == end_id_) { - // Allocate all probability mass to eos_id for finished branchs and the - // other candidate ids can be ignored. - items->emplace_back(offset, end_id_, pre_score); - } else { - for (size_t d = 0; d < instance_dim; d++) { - const size_t dim_offset = offset * instance_dim + d; - items->emplace_back(offset, ids_data[dim_offset], - scores_data[dim_offset]); - } - } - } - - sent_offset_++; - return true; -} - -std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) { - os << "{"; - os << "offset: " << item.offset << ", "; - os << "id: " << item.id << ", "; - os << "score: " << item.score << ""; - os << "}"; - - return os; -} - -std::string ItemToString(const BeamSearch::Item &item) { - std::ostringstream stream; - stream << item; - return stream.str(); -} - class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -219,18 +29,23 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor) The LoDTensor containing the selected ids at the " "previous step. It should be a tensor with shape (batch_size, 1) " "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " - "thefirst step."); + "the first step."); AddInput("pre_scores", "(LoDTensor) The LoDTensor containing the accumulated " "scores corresponding to the selected ids at the previous step."); AddInput("ids", "(LoDTensor) The LoDTensor containing the candidates ids. Its " - "shape should be (batch_size * beam_size, K), where K supposed to " - "be beam_size."); + "shape should be (batch_size * beam_size, W). 
If not set, it will "
+             "be calculated out according to Input(scores) in this operator.")
+        .AsDispensable();
     AddInput("scores",
-             "(LoDTensor) The LodTensor containing the accumulated scores "
-             "corresponding to Input(ids) and its shape is the same as the "
-             "shape of Input(ids).");
+             "(LoDTensor) The LoDTensor containing the current scores "
+             "corresponding to Input(ids). If Input(ids) is not nullptr, its "
+             "shape is the same as that of Input(ids). "
+             "If is_accumulated is true, Input(scores) is accumulated scores "
+             "and will be used directly. Else, each score will be "
+             "transformed to the log domain and accumulated with "
+             "Input(pre_scores) first.");
     AddOutput("selected_ids",
               "A LodTensor that stores the IDs selected by beam search.");
     AddOutput("selected_scores",
@@ -242,6 +57,9 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("beam_size", "beam size for beam search");
     AddAttr<int>("end_id",
                  "the token id which indicates the end of a sequence");
+    AddAttr<bool>("is_accumulated",
+                  "Whether the Input(scores) is accumulated scores.")
+        .SetDefault(true);
     AddComment(R"DOC(
 This operator does the search in beams for one time step.
@@ -265,10 +83,9 @@ class BeamSearchOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
     for (const std::string &arg :
-         std::vector<std::string>({"pre_ids", "ids", "scores"})) {
+         std::vector<std::string>({"pre_ids", "scores"})) {
       PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'",
                      arg);
     }
@@ -279,12 +96,22 @@ class BeamSearchOp : public framework::OperatorWithKernel {
     }
   }
 
+ protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("pre_ids")->type(),
-        platform::CPUPlace());
-    return kt;
+    auto *scores = ctx.Input<framework::LoDTensor>("scores");
+    size_t level = ctx.Attr<int>("level");
+    size_t batch_size = scores->lod()[level].size() - 1;
+    // The current CUDA kernel only supports cases with batch_size <= 4.
+    // Compute on CPU for cases with batch_size > 4.
+    if (batch_size <= 4) {
+      return framework::OpKernelType(
+          ctx.Input<framework::LoDTensor>("pre_ids")->type(), ctx.GetPlace());
+    } else {
+      return framework::OpKernelType(
+          ctx.Input<framework::LoDTensor>("pre_ids")->type(),
+          platform::CPUPlace());
+    }
   }
 };
diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc
new file mode 100644
index 0000000000..4ef9476eee
--- /dev/null
+++ b/paddle/fluid/operators/beam_search_op.cu.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/beam_search_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + beam_search, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index b5e2ed0592..1b939e742d 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -14,187 +14,12 @@ limitations under the License. */ #pragma once -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/beam_search.h" namespace paddle { namespace operators { -/* - * This is an implementation of beam search. - * - * To explain the details, lets take machine translation task for example, in - * this task, one source sentence is translated to multiple target sentences, - * during this period, one sentence will be translated to multiple translation - * prefixes(target sentence that have not ended), in each time step a prefix - * will have some candidates, input the candidate ids and their corresponding - * scores (probabilities), it will sort and select the top beam_size candidates - * for each source sentence, and store the selected candidates's score and their - * corresponding ids to LoDTensors. - * - * A detailed example: - * - * Input - * - * ids: - * LoD (should have 2 levels) - * first level: [0, 1, 4] - * second level: [0, 1, 2, 3, 4] - * - * tensor's data - * [ - * [4, 2, 5] - * [2, 1, 3] - * [3, 5, 2] - * [8, 2, 1] - * ] - * - * scores: - * LoD same as `ids` - * tensor's data - * [ - * [0.5, 0.3, 0.2] - * [0.6, 0.3, 0.1] - * [0.9, 0.5, 0.1] - * [0.7, 0.5, 0.1] - * ] - * - * the inputs means that there are 2 source sentences to translate, and the - * first source has 1 prefix, the second source has 2 prefix. - * - * lets assume beam size is 2, and the beam search's output should be - * LoD - * first level: - * [0, 1, 2] - * second level: - * [0, 2, 4] - * - * id tensor's data - * [[ - * 4, - * 1, - * 3, - * 8, - * ]] - * - * score tensor's data - * [[ - * 0.5, - * 0.3, - * 0.9, - * 0.7 - * ]] - * - * TODO all the prune operations should be in the beam search, so it is better - * to split the beam search algorithm into a sequence of smaller operators, and - * the prune operators can be inserted in this sequence. - */ -class BeamSearch { - public: - // TODO(superjom) make type customizable - using id_t = size_t; - using score_t = float; - /* - * Input the arguments that needed by this class. - */ - BeamSearch(const framework::LoDTensor& ids, - const framework::LoDTensor& scores, size_t level, size_t beam_size, - int end_id) - : beam_size_(beam_size), - ids_(&ids), - scores_(&scores), - lod_level_(level), - end_id_(end_id) {} - - /* - * The main function of beam search. - * - * @selected_ids: a [None, 1]-shaped tensor with LoD. - * In a machine translation model, it might be the candidate term id sets, - * each set stored as a varience-length sequence. 
- * The format might be described with a two-level LoD - * - [[0 1] - * - [0 1 2]] - * - [[] - * - [0 1]] - * the first level of LoD tells that there are two source sentences. The - * second level describes the details of the candidate id set's offsets in - * the - * source sentences. - * - * @selected_scores: a LoD tensor with the same shape and LoD with - * selected_ids. - * It stores the corresponding scores of candidate ids in selected_ids. - * - * Return false if all the input tensor is empty, in machine translation task - * that means no candidates is provided, and the task will stop running. - */ - void operator()(const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores, - framework::LoDTensor* selected_ids, - framework::LoDTensor* selected_scores); - /* - * The basic items help to sort. - */ - struct Item { - Item() {} - Item(size_t offset, size_t id, float score) - : offset(offset), id(id), score(score) {} - // offset in the higher lod level. - size_t offset; - // // prefix id in the lower lod level. - // size_t prefix; - // the candidate id - id_t id; - // the corresponding score - score_t score; - }; - - protected: - /* - * Prune the source sentences all branchs finished, and it is optional. - * Pruning must one step later than finishing (thus pre_ids is needed here), - * since the end tokens must be writed out. - */ - void PruneEndBeams(const framework::LoDTensor& pre_ids, - std::vector>* items); - - /* - * Transform the items into a map whose key is offset, value is the items. - * NOTE low performance. - */ - std::vector> ToMap( - const std::vector>& inputs, size_t element_num); - - /* - * For each source, select top beam_size records. - */ - std::vector> SelectTopBeamSizeItems( - const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores); - - /* - * Get the items of next source sequence, return false if no remaining items. 
- */ - bool NextItemSet(const framework::LoDTensor& pre_ids, - const framework::LoDTensor& pre_scores, - std::vector* items); - - private: - size_t beam_size_; - const framework::LoDTensor* ids_; - const framework::LoDTensor* scores_; - size_t lod_level_{0}; - size_t sent_offset_{0}; - int end_id_{0}; -}; - -std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item); - -std::string ItemToString(const BeamSearch::Item& item); - template class BeamSearchOpKernel : public framework::OpKernel { public: @@ -203,7 +28,7 @@ class BeamSearchOpKernel : public framework::OpKernel { auto* scores = context.Input("scores"); auto* pre_ids = context.Input("pre_ids"); auto* pre_scores = context.Input("pre_scores"); - PADDLE_ENFORCE_NOT_NULL(ids); + PADDLE_ENFORCE_NOT_NULL(scores); PADDLE_ENFORCE_NOT_NULL(pre_ids); PADDLE_ENFORCE_NOT_NULL(pre_scores); @@ -211,14 +36,20 @@ class BeamSearchOpKernel : public framework::OpKernel { size_t level = context.Attr("level"); size_t beam_size = context.Attr("beam_size"); int end_id = context.Attr("end_id"); - BeamSearch alg(*ids, *scores, level, beam_size, end_id); + bool is_accumulated = context.Attr("is_accumulated"); + auto selected_ids = context.Output("selected_ids"); auto selected_scores = context.Output("selected_scores"); PADDLE_ENFORCE_NOT_NULL(selected_ids); PADDLE_ENFORCE_NOT_NULL(selected_scores); - alg(*pre_ids, *pre_scores, selected_ids, selected_scores); + + math::BeamSearchFunctor alg; + alg(context.template device_context(), pre_ids, pre_scores, + ids, scores, selected_ids, selected_scores, level, beam_size, end_id, + is_accumulated); } }; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc deleted file mode 100644 index 40b46781da..0000000000 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/fluid/operators/beam_search_op.h" - -#include -#include - -namespace paddle { -namespace test { - -using std::vector; -using framework::LoDTensor; -using framework::LoD; -using operators::BeamSearch; -using paddle::platform::CPUPlace; -using std::cout; -using std::endl; - -void CreateInput(LoDTensor* ids, LoDTensor* scores) { - LoD lod; - vector level0({0, 2, 4}); - vector level1({0, 1, 2, 3, 4}); - lod.push_back(level0); - lod.push_back(level1); - ids->set_lod(lod); - scores->set_lod(lod); - - auto dims = framework::make_ddim(vector({4, 3})); - ids->Resize(dims); - scores->Resize(dims); - CPUPlace place; - - auto* ids_data = ids->mutable_data(place); - auto* scores_data = scores->mutable_data(place); - vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); - vector _scores( - {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); - - for (int i = 0; i < 12; i++) { - ids_data[i] = _ids[i]; - scores_data[i] = _scores[i]; - } -} - -// It seems that beam_search_op has bugs. 
-TEST(DISABLED_beam_search_op, run) { - CPUPlace place; - LoDTensor ids, scores; - CreateInput(&ids, &scores); - - LoDTensor pre_ids; - pre_ids.Resize(framework::make_ddim(vector(4, 1))); - for (int i = 0; i < 4; i++) { - pre_ids.mutable_data(place)[i] = i + 1; - } - LoDTensor pre_scores; - pre_scores.Resize(framework::make_ddim(vector(4, 1))); - for (int i = 0; i < 4; i++) { - pre_scores.mutable_data(place)[i] = 0.1 * (i + 1); - } - - BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0); - LoDTensor sids, sscores; - beamsearch(pre_ids, pre_scores, &sids, &sscores); - - LOG(INFO) << "score: " << sscores << endl; - - ASSERT_EQ(sids.lod(), sscores.lod()); - - vector tids({4, 2, 3, 8}); - vector tscores({0.5f, 0.6f, 0.9f, 0.7f}); - - for (int i = 0; i < 4; i++) { - ASSERT_EQ(tids[i], sids.data()[i]); - ASSERT_EQ(tscores[i], sscores.data()[i]); - } -} - -} // namespace test -} // namespace paddle diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index e223be7af8..f9570e4e2e 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -87,8 +87,8 @@ class BprLossGradientOpKernel : public framework::OpKernel { auto* label = ctx.Input("Label"); auto* dx = ctx.Output(framework::GradVarName("X")); - const int step_size = x->dims()[0]; - const int num_classes = x->dims()[1]; + const size_t step_size = static_cast(x->dims()[0]); + const size_t num_classes = static_cast(x->dims()[1]); T* dx_data = dx->mutable_data(ctx.GetPlace()); const T* dy_data = dy->data(); const T* x_data = x->data(); diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 6c85f1577e..d3a61dc367 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -45,3 +45,7 @@ detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op foreach(src ${LOCAL_DETECTION_LIBS}) set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs") endforeach() + +cc_library(mask_util SRCS mask_util.cc DEPS memory) +cc_test(mask_util_test SRCS mask_util_test.cc DEPS memory mask_util) +detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS mask_util) diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 6abeca1da4..b99edb5bf0 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -1,13 +1,17 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #pragma once #include #include "paddle/fluid/framework/eigen.h" @@ -88,7 +92,9 @@ void BboxOverlaps(const framework::Tensor& r_boxes, inter_w = std::max(x_max - x_min + 1, zero); inter_h = std::max(y_max - y_min + 1, zero); inter_area = inter_w * inter_h; - overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); + overlaps_et(i, j) = + (inter_area == 0.) ? 
0 : inter_area / + (r_box_area + c_box_area - inter_area); } } } diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc new file mode 100644 index 0000000000..46727c29de --- /dev/null +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -0,0 +1,437 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/detection/mask_util.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +const int kBoxDim = 4; + +template +void AppendMask(LoDTensor* out, int64_t offset, Tensor* to_add) { + auto* out_data = out->data(); + auto* to_add_data = to_add->data(); + memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); +} + +class GenerateMaskLabelsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtClasses"), + "Input(GtClasses) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(IsCrowd) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtSegms"), + "Input(GtSegms) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Rois"), "Input(Rois) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("LabelsInt32"), + "Input(LabelsInt32) shouldn't be null."); + + PADDLE_ENFORCE( + ctx->HasOutput("MaskRois"), + "Output(MaskRois) of GenerateMaskLabelsOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("RoiHasMaskInt32"), + "Output(RoiHasMaskInt32) of GenerateMaskLabelsOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("MaskInt32"), + "Output(MaskInt32) of GenerateMaskLabelsOp should not be null"); + + auto im_info_dims = ctx->GetInputDim("ImInfo"); + auto gt_segms_dims = ctx->GetInputDim("GtSegms"); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); + PADDLE_ENFORCE_EQ(gt_segms_dims.size(), 2, + "The rank of Input(GtSegms) must be 2."); + PADDLE_ENFORCE_EQ(gt_segms_dims[1], 2, + "The second dim of Input(GtSegms) must be 2."); + int num_classes = ctx->Attrs().Get("num_classes"); + int resolution = ctx->Attrs().Get("resolution"); + + ctx->SetOutputDim("MaskRois", {-1, 4}); + ctx->SetOutputDim("RoiHasMaskInt32", {-1, 1}); + ctx->SetOutputDim("MaskInt32", {-1, num_classes * resolution * resolution}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) 
const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Rois")); + return framework::OpKernelType(data_type, platform::CPUPlace()); + } +}; + +/* + * Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2) + * to encode class specific mask targets. + */ +template +static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx, + const Tensor& masks, + const Tensor& mask_class_labels, + const int resolution, const int num_classes, + Tensor* mask_targets) { + const uint8_t* masks_data = masks.data(); + int64_t num_mask = masks.dims()[0]; + const int* mask_class_labels_data = mask_class_labels.data(); + const int M = resolution * resolution; + const int mask_dim = M * num_classes; + + int* mask_targets_data = + mask_targets->mutable_data({num_mask, mask_dim}, ctx.GetPlace()); + math::set_constant(ctx, mask_targets, -1); + for (int64_t mask_id = 0; mask_id < num_mask; ++mask_id) { + int cls = mask_class_labels_data[mask_id]; + int start = M * cls; + if (cls > 0) { + for (int i = 0; i < M; ++i) { + mask_targets_data[mask_id * mask_dim + start + i] = + static_cast(masks_data[mask_id * M + i]); + } + } + } +} + +template +std::vector SampleMaskForOneImage( + const platform::CPUDeviceContext& ctx, const Tensor& im_info, + const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_segms, + const Tensor& rois, const Tensor& label_int32, const int num_classes, + const int resolution, const framework::LoD& segm_length) { + // Prepare the mask targets by associating one gt mask to each training roi + // that has a fg (non-bg) class label. + const int64_t gt_size = static_cast(gt_classes.dims()[0]); + const int64_t roi_size = static_cast(rois.dims()[0]); + const int* gt_classes_data = gt_classes.data(); + const int* is_crowd_data = is_crowd.data(); + const int* label_int32_data = label_int32.data(); + PADDLE_ENFORCE_EQ(roi_size, label_int32.dims()[0]); + + std::vector mask_gt_inds, fg_inds; + std::vector>> gt_polys; + + auto polys_num = segm_length[1]; + auto segm_lod_offset = framework::ConvertToOffsetBasedLoD(segm_length); + auto lod1 = segm_lod_offset[1]; + auto lod2 = segm_lod_offset[2]; + const T* polys_data = gt_segms.data(); + for (int64_t i = 0; i < gt_size; ++i) { + if ((gt_classes_data[i] > 0) && (is_crowd_data[i] == 0)) { + mask_gt_inds.emplace_back(i); + + // slice fg segmentation polys + int poly_num = polys_num[i]; + std::vector> polys; + int s_idx = lod1[i]; + for (int j = 0; j < poly_num; ++j) { + int s = lod2[s_idx + j]; + int e = lod2[s_idx + j + 1]; + PADDLE_ENFORCE_NE(s, e); + std::vector plts(polys_data + s * 2, polys_data + e * 2); + polys.push_back(plts); + } + gt_polys.push_back(polys); + } + } + for (int64_t i = 0; i < roi_size; ++i) { + if (label_int32_data[i] > 0) { + fg_inds.emplace_back(i); + } + } + int gt_num = mask_gt_inds.size(); + int fg_num = fg_inds.size(); + + Tensor boxes_from_polys; + boxes_from_polys.mutable_data({gt_num, 4}, platform::CPUPlace()); + Poly2Boxes(gt_polys, boxes_from_polys.data()); + + std::vector roi_has_mask = + std::vector(fg_inds.begin(), fg_inds.end()); + Tensor mask_class_labels; + Tensor masks; + Tensor rois_fg; + + auto im_scale = im_info.data()[2]; + if (fg_num > 0) { + // Class labels for the foreground rois + mask_class_labels.mutable_data({fg_num, 1}, ctx.GetPlace()); + Gather(label_int32_data, 1, fg_inds.data(), fg_inds.size(), + mask_class_labels.data()); + + uint8_t* masks_data = masks.mutable_data( + {fg_num, resolution * resolution}, ctx.GetPlace()); + + // Find overlap 
between all foreground rois and the bounding boxes + // enclosing each segmentation + T* rois_fg_data = rois_fg.mutable_data({fg_num, 4}, ctx.GetPlace()); + Gather(rois.data(), 4, fg_inds.data(), fg_inds.size(), + rois_fg.data()); + + for (int k = 0; k < rois_fg.numel(); ++k) { + rois_fg_data[k] = rois_fg_data[k] / im_scale; + } + + Tensor overlaps_bbfg_bbpolys; + overlaps_bbfg_bbpolys.mutable_data({fg_num, gt_num}, ctx.GetPlace()); + BboxOverlaps(rois_fg, boxes_from_polys, &overlaps_bbfg_bbpolys); + + // Map from each fg rois to the index of the mask with highest overlap + // (measured by bbox overlap) + T* overlaps_bbfg_bbpolys_data = overlaps_bbfg_bbpolys.data(); + std::vector fg_masks_inds; + for (int64_t i = 0; i < fg_num; ++i) { + const T* v = overlaps_bbfg_bbpolys_data + i * gt_num; + T max_overlap = std::numeric_limits::min(); + int id = 0; + for (int64_t j = 0; j < gt_num; ++j) { + if (v[j] > max_overlap) { + max_overlap = v[j]; + id = j; + } + } + fg_masks_inds.push_back(id); + } + + // add fg targets + for (int64_t i = 0; i < fg_num; ++i) { + int fg_polys_ind = fg_masks_inds[i]; + T* roi_fg = rois_fg_data + i * 4; + uint8_t* mask = masks_data + i * resolution * resolution; + Polys2MaskWrtBox(gt_polys[fg_polys_ind], roi_fg, resolution, mask); + } + } else { + // The network cannot handle empty blobs, so we must provide a mask + // We simply take the first bg roi, given it an all -1's mask (ignore + // label), and label it with class zero (bg). + int bg_num = 1; + T* rois_fg_data = rois_fg.mutable_data({bg_num, 4}, ctx.GetPlace()); + const T* rois_data = rois.data(); + std::vector bg_inds; + for (int64_t i = 0; i < roi_size; ++i) { + if (label_int32_data[i] == 0) { + bg_inds.emplace_back(i); + rois_fg_data[0] = rois_data[0] / im_scale; + rois_fg_data[1] = rois_data[1] / im_scale; + rois_fg_data[2] = rois_data[2] / im_scale; + rois_fg_data[3] = rois_data[3] / im_scale; + break; + } + } + masks.mutable_data({bg_num, resolution * resolution}, + ctx.GetPlace()); + math::set_constant(ctx, &masks, -1); + int* mask_class_labels_data = + mask_class_labels.mutable_data({bg_num, 1}, ctx.GetPlace()); + mask_class_labels_data[0] = 0; + roi_has_mask = std::vector(bg_inds.begin(), bg_inds.end()); + } + + Tensor masks_expand; + ExpandMaskTarget(ctx, masks, mask_class_labels, resolution, num_classes, + &masks_expand); + + T* rois_fg_data = rois_fg.data(); + for (int k = 0; k < rois_fg.numel(); ++k) { + rois_fg_data[k] = rois_fg_data[k] * im_scale; + } + + Tensor roi_has_mask_t; + int roi_has_mask_size = roi_has_mask.size(); + int* roi_has_mask_data = + roi_has_mask_t.mutable_data({roi_has_mask_size, 1}, ctx.GetPlace()); + std::copy(roi_has_mask.begin(), roi_has_mask.end(), roi_has_mask_data); + + std::vector res; + res.emplace_back(rois_fg); + res.emplace_back(roi_has_mask_t); + res.emplace_back(masks_expand); + return res; +} + +template +class GenerateMaskLabelsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* im_info = ctx.Input("ImInfo"); + auto* gt_classes = ctx.Input("GtClasses"); + auto* is_crowd = ctx.Input("IsCrowd"); + auto* gt_segms = ctx.Input("GtSegms"); + auto* rois = ctx.Input("Rois"); + auto* label_int32 = ctx.Input("LabelsInt32"); + + auto* mask_rois = ctx.Output("MaskRois"); + auto* roi_has_mask_int32 = ctx.Output("RoiHasMaskInt32"); + auto* mask_int32 = ctx.Output("MaskInt32"); + + int num_classes = ctx.Attr("num_classes"); + int resolution = ctx.Attr("resolution"); + + 
PADDLE_ENFORCE_EQ(gt_classes->lod().size(), 1UL, + "GenerateMaskLabelsOp gt_classes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "GenerateMaskLabelsOp is_crowd needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(rois->lod().size(), 1UL, + "GenerateMaskLabelsOp rois needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(label_int32->lod().size(), 1UL, + "GenerateMaskLabelsOp label_int32 needs 1 level of LoD"); + + PADDLE_ENFORCE_EQ(gt_segms->lod().size(), 3UL); + + int64_t n = static_cast(gt_classes->lod().back().size() - 1); + PADDLE_ENFORCE_EQ(gt_segms->lod()[0].size() - 1, n); + + int mask_dim = num_classes * resolution * resolution; + + mask_rois->mutable_data({rois->numel(), kBoxDim}, ctx.GetPlace()); + roi_has_mask_int32->mutable_data({rois->numel(), 1}, ctx.GetPlace()); + mask_int32->mutable_data({rois->numel(), mask_dim}, ctx.GetPlace()); + + framework::LoD lod; + std::vector lod0(1, 0); + + int64_t num_mask = 0; + auto& dev_ctx = ctx.device_context(); + + auto gt_classes_lod = gt_classes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); + auto rois_lod = rois->lod().back(); + auto label_int32_lod = label_int32->lod().back(); + auto gt_segms_lod = gt_segms->lod(); + + for (int i = 0; i < n; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor gt_classes_slice = + gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor label_int32_slice = + label_int32->Slice(label_int32_lod[i], label_int32_lod[i + 1]); + Tensor rois_slice = rois->Slice(rois_lod[i], rois_lod[i + 1]); + + auto sub_lod_and_offset = + framework::GetSubLoDAndAbsoluteOffset(gt_segms_lod, i, i + 1, 0); + auto lod_length = sub_lod_and_offset.first; + size_t s = sub_lod_and_offset.second.first; + size_t e = sub_lod_and_offset.second.second; + Tensor gt_segms_slice = gt_segms->Slice(s, e); + + std::vector tensor_output = SampleMaskForOneImage( + dev_ctx, im_info_slice, gt_classes_slice, is_crowd_slice, + gt_segms_slice, rois_slice, label_int32_slice, num_classes, + resolution, lod_length); + + Tensor sampled_mask_rois = tensor_output[0]; + Tensor sampled_roi_has_mask_int32 = tensor_output[1]; + Tensor sampled_mask_int32 = tensor_output[2]; + + AppendMask(mask_rois, kBoxDim * num_mask, &sampled_mask_rois); + AppendMask(roi_has_mask_int32, num_mask, + &sampled_roi_has_mask_int32); + AppendMask(mask_int32, mask_dim * num_mask, &sampled_mask_int32); + + num_mask += sampled_mask_rois.dims()[0]; + lod0.emplace_back(num_mask); + } + + lod.emplace_back(lod0); + mask_rois->set_lod(lod); + roi_has_mask_int32->set_lod(lod); + mask_int32->set_lod(lod); + mask_rois->Resize({num_mask, kBoxDim}); + roi_has_mask_int32->Resize({num_mask, 1}); + mask_int32->Resize({num_mask, mask_dim}); + } +}; + +class GenerateMaskLabelsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("ImInfo", + "(Tensor), This input is a 2D Tensor with shape [B, 3]. " + "B is the number of input images, " + "each element consists of im_height, im_width, im_scale."); + AddInput("GtClasses", + "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. " + "M is the number of groundtruth, " + "each element is a class label of groundtruth."); + AddInput( + "IsCrowd", + "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. 
" + "M is the number of groundtruth, " + "each element is a flag indicates whether a groundtruth is crowd."); + AddInput( + "GtSegms", + "(LoDTensor), This input is a 2D LoDTensor with shape [S, 2], it's LoD " + "level is 3. The LoD[0] represents the gt objects number of each " + "instance. LoD[1] represents the segmentation counts of each objects. " + "LoD[2] represents the polygons number of each segmentation. S the " + "total number of polygons coordinate points. Each element is (x, y) " + "coordinate points."); + AddInput( + "Rois", + "(LoDTensor), This input is a 2D LoDTensor with shape [R, 4]. " + "R is the number of rois which is the output of " + "generate_proposal_labels, " + "each element is a bounding box with (xmin, ymin, xmax, ymax) format."); + AddInput("LabelsInt32", + "(LoDTensor), This intput is a 2D LoDTensor with shape [R, 1], " + "each element repersents a class label of a roi"); + AddOutput( + "MaskRois", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. " + "P is the number of mask, " + "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); + AddOutput("RoiHasMaskInt32", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], " + "each element repersents the output mask rois index with regard " + "to input rois"); + AddOutput("MaskInt32", + "(LoDTensor), This output is a 4D LoDTensor with shape [P, Q], " + "Q equal to num_classes * resolution * resolution"); + + AddAttr("num_classes", "Class number."); + AddAttr("resolution", "Resolution of mask."); + + AddComment(R"DOC( +This operator can be, for given the RoIs and corresponding labels, +to sample foreground RoIs. This mask branch also has +a :math: `K \\times M^{2}` dimensional output targets for each foreground +RoI, which encodes K binary masks of resolution M x M, one for each of the +K classes. This mask targets are used to compute loss of mask branch. 
+ )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(generate_mask_labels, ops::GenerateMaskLabelsOp, + ops::GenerateMaskLabelsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(generate_mask_labels, + ops::GenerateMaskLabelsKernel); diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index a652d4d957..5b2e571baf 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -48,20 +48,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { "Input(GtBoxes) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasOutput("Rois"), - "Output(Rois) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("Rois"), + "Output(Rois) of GenerateProposalLabelsOp should not be null"); PADDLE_ENFORCE( ctx->HasOutput("LabelsInt32"), - "Output(LabelsInt32) of RpnTargetAssignOp should not be null"); + "Output(LabelsInt32) of GenerateProposalLabelsOp should not be null"); PADDLE_ENFORCE( ctx->HasOutput("BboxTargets"), - "Output(BboxTargets) of RpnTargetAssignOp should not be null"); - PADDLE_ENFORCE( - ctx->HasOutput("BboxInsideWeights"), - "Output(BboxInsideWeights) of RpnTargetAssignOp should not be null"); - PADDLE_ENFORCE( - ctx->HasOutput("BboxOutsideWeights"), - "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null"); + "Output(BboxTargets) of GenerateProposalLabelsOp should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("BboxInsideWeights"), + "Output(BboxInsideWeights) of GenerateProposalLabelsOp " + "should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("BboxOutsideWeights"), + "Output(BboxOutsideWeights) of GenerateProposalLabelsOp " + "should not be null"); auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); @@ -225,30 +226,36 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, template std::vector SampleRoisForOneImage( - const platform::CPUDeviceContext& context, Tensor* rpn_rois, - Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info, - const int batch_size_per_im, const float fg_fraction, const float fg_thresh, - const float bg_thresh_hi, const float bg_thresh_lo, + const platform::CPUDeviceContext& context, const Tensor& rpn_rois_in, + const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_boxes, + const Tensor& im_info, const int batch_size_per_im, const float fg_fraction, + const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo, const std::vector& bbox_reg_weights, const int class_nums, std::minstd_rand engine, bool use_random) { - auto rpn_rois_et = framework::EigenTensor::From(*rpn_rois); - auto im_scale = im_info->data()[2]; - rpn_rois_et = rpn_rois_et / im_scale; + auto im_scale = im_info.data()[2]; + + Tensor rpn_rois; + rpn_rois.mutable_data(rpn_rois_in.dims(), context.GetPlace()); + T* rpn_rois_dt = rpn_rois.data(); + const T* rpn_rois_in_dt = rpn_rois_in.data(); + for (int i = 0; i < rpn_rois.numel(); ++i) { + rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale; + } Tensor boxes; - int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0]; + int proposals_num = gt_boxes.dims()[0] + rpn_rois.dims()[0]; boxes.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - 
Concat(context, *gt_boxes, *rpn_rois, &boxes); + Concat(context, gt_boxes, rpn_rois, &boxes); // Overlaps Tensor proposal_to_gt_overlaps; - proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes->dims()[0]}, + proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes.dims()[0]}, context.GetPlace()); - BboxOverlaps(boxes, *gt_boxes, &proposal_to_gt_overlaps); + BboxOverlaps(boxes, gt_boxes, &proposal_to_gt_overlaps); // Generate proposal index std::vector> fg_bg_gt = SampleFgBgGt( - context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im, + context, &proposal_to_gt_overlaps, is_crowd, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random); std::vector fg_inds = fg_bg_gt[0]; std::vector bg_inds = fg_bg_gt[1]; @@ -263,7 +270,7 @@ std::vector SampleRoisForOneImage( sampled_boxes.mutable_data(bbox_dim, context.GetPlace()); sampled_labels.mutable_data({boxes_num}, context.GetPlace()); sampled_gts.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - GatherBoxesLabels(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds, + GatherBoxesLabels(context, boxes, gt_boxes, gt_classes, fg_inds, bg_inds, gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts); // Compute targets @@ -397,8 +404,8 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); Tensor im_info_slice = im_info->Slice(i, i + 1); std::vector tensor_output = SampleRoisForOneImage( - dev_ctx, &rpn_rois_slice, &gt_classes_slice, &is_crowd_slice, - &gt_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction, + dev_ctx, rpn_rois_slice, gt_classes_slice, is_crowd_slice, + gt_boxes_slice, im_info_slice, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine, use_random); Tensor sampled_rois = tensor_output[0]; @@ -467,7 +474,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "P usually equal to batch_size_per_im * batch_size, " "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); AddOutput("LabelsInt32", - "(LoDTensor), This output is a 2D LoDTensor with shape [P], " + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], " "each element represents a class label of a roi"); AddOutput("BboxTargets", "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " diff --git a/paddle/fluid/operators/detection/mask_util.cc b/paddle/fluid/operators/detection/mask_util.cc new file mode 100644 index 0000000000..bd6fee7138 --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util.cc @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/mask_util.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace operators { + +uint32_t UMax(uint32_t a, uint32_t b) { return (a > b) ? 
a : b; } + +static inline int Compare(const void* a, const void* b) { + uint32_t c = *(reinterpret_cast(a)); + uint32_t d = *(reinterpret_cast(b)); + return c > d ? 1 : c < d ? -1 : 0; +} + +void Decode(const uint32_t* cnts, int m, uint8_t* mask) { + uint8_t v = 0; + for (int j = 0; j < m; j++) { + for (uint32_t k = 0; k < cnts[j]; k++) { + *(mask++) = v; + } + v = !v; + } +} + +typedef uint32_t uint; +void Poly2Mask(const float* xy, int k, int h, int w, uint8_t* mask) { + int j, m = 0; + double scale = 5; + int *x, *y, *u, *v; + uint *a, *b; + platform::CPUPlace cpu; + auto xptr = memory::Alloc(cpu, sizeof(int) * (k + 1) * 2); + x = reinterpret_cast(xptr->ptr()); + y = x + (k + 1); + + for (j = 0; j < k; j++) x[j] = static_cast(scale * xy[j * 2 + 0] + .5); + x[k] = x[0]; + for (j = 0; j < k; j++) y[j] = static_cast(scale * xy[j * 2 + 1] + .5); + y[k] = y[0]; + for (j = 0; j < k; j++) { + m += UMax(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1])) + 1; + } + auto vptr = memory::Alloc(cpu, sizeof(int) * m * 2); + u = reinterpret_cast(vptr->ptr()); + v = u + m; + m = 0; + for (j = 0; j < k; j++) { + int xs = x[j], xe = x[j + 1], ys = y[j], ye = y[j + 1], dx, dy, t, d; + int flip; + double s; + dx = abs(xe - xs); + dy = abs(ys - ye); + flip = (dx >= dy && xs > xe) || (dx < dy && ys > ye); + if (flip) { + t = xs; + xs = xe; + xe = t; + t = ys; + ys = ye; + ye = t; + } + if (dx >= dy) { + s = dx == 0 ? 0 : static_cast(ye - ys) / dx; + for (d = 0; d <= dx; d++) { + t = flip ? dx - d : d; + u[m] = t + xs; + v[m] = static_cast(ys + s * t + .5); + m++; + } + } else { + s = dy == 0 ? 0 : static_cast(xe - xs) / dy; + for (d = 0; d <= dy; d++) { + t = flip ? dy - d : d; + v[m] = t + ys; + u[m] = static_cast(xs + s * t + .5); + m++; + } + } + } + /* get points along y-boundary and downsample */ + k = m; + m = 0; + double xd, yd; + auto xyptr = memory::Alloc(cpu, sizeof(int) * k * 2); + x = reinterpret_cast(xyptr->ptr()); + y = x + k; + for (j = 1; j < k; j++) { + if (u[j] != u[j - 1]) { + xd = static_cast(u[j] < u[j - 1] ? u[j] : u[j] - 1); + xd = (xd + .5) / scale - .5; + if (floor(xd) != xd || xd < 0 || xd > w - 1) continue; + yd = static_cast(v[j] < v[j - 1] ? 
v[j] : v[j - 1]); + yd = (yd + .5) / scale - .5; + if (yd < 0) + yd = 0; + else if (yd > h) + yd = h; + yd = ceil(yd); + x[m] = static_cast(xd); + y[m] = static_cast(yd); + m++; + } + } + /* compute rle encoding given y-boundary points */ + k = m; + auto aptr = memory::Alloc(cpu, sizeof(uint) * (k + 1)); + a = reinterpret_cast(aptr->ptr()); + for (j = 0; j < k; j++) a[j] = static_cast(x[j] * h + y[j]); + a[k++] = static_cast(h * w); + + qsort(a, k, sizeof(uint), Compare); + uint p = 0; + for (j = 0; j < k; j++) { + uint t = a[j]; + a[j] -= p; + p = t; + } + auto bptr = memory::Alloc(cpu, sizeof(uint32_t) * k); + b = reinterpret_cast(bptr->ptr()); + j = m = 0; + b[m++] = a[j++]; + while (j < k) { + if (a[j] > 0) { + b[m++] = a[j++]; + } else { + j++; + if (j < k) b[m - 1] += a[j++]; + } + } + + // convert to mask + auto mskptr = memory::Alloc(cpu, sizeof(uint8_t) * h * w); + uint8_t* msk = reinterpret_cast(mskptr->ptr()); + Decode(b, m, msk); + + for (int ii = 0; ii < h; ++ii) { + for (int jj = 0; jj < w; ++jj) { + mask[ii * w + jj] = msk[jj * h + ii]; + } + } +} + +void Poly2Boxes(const std::vector>>& polys, + float* boxes) { + // each element of polys is the polygon list of one instance + for (size_t i = 0; i < polys.size(); ++i) { + float x0 = std::numeric_limits::max(); + float x1 = std::numeric_limits::min(); + float y0 = std::numeric_limits::max(); + float y1 = std::numeric_limits::min(); + // each list may have more than one polygon + for (size_t j = 0; j < polys[i].size(); ++j) { + for (size_t k = 0; k < polys[i][j].size() / 2; ++k) { + x0 = std::min(x0, polys[i][j][2 * k]); + x1 = std::max(x1, polys[i][j][2 * k]); + y0 = std::min(y0, polys[i][j][2 * k + 1]); + y1 = std::max(y1, polys[i][j][2 * k + 1]); + } + } + boxes[i * 4] = x0; + boxes[i * 4 + 1] = y0; + boxes[i * 4 + 2] = x1; + boxes[i * 4 + 3] = y1; + } +} + +void Polys2MaskWrtBox(const std::vector>& polygons, + const float* box, int M, uint8_t* mask) { + float w = box[2] - box[0]; + float h = box[3] - box[1]; + w = std::max(w, static_cast(1.)); + h = std::max(h, static_cast(1.)); + + uint8_t* msk = nullptr; + if (polygons.size() == 1UL) { + msk = mask; + } else { + msk = reinterpret_cast( + malloc(M * M * polygons.size() * sizeof(uint8_t))); + } + for (size_t i = 0; i < polygons.size(); ++i) { + int k = polygons[i].size() / 2; + std::vector p; + for (int j = 0; j < k; ++j) { + float pw = (polygons[i][2 * j] - box[0]) * M / w; + float ph = (polygons[i][2 * j + 1] - box[1]) * M / h; + p.push_back(pw); + p.push_back(ph); + } + uint8_t* msk_i = msk + i * M * M; + Poly2Mask(p.data(), k, M, M, msk_i); + } + + if (polygons.size() > 1UL) { + for (size_t i = 0; i < polygons.size(); ++i) { + uint8_t* msk_i = msk + i * M * M; + for (int j = 0; j < M * M; ++j) { + if (i == 0) { + mask[j] = msk_i[j]; + } else { + mask[j] = (mask[j] + msk_i[j]) > 0 ? 1 : 0; + } + } + } + free(msk); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/mask_util.h b/paddle/fluid/operators/detection/mask_util.h new file mode 100644 index 0000000000..4e0ea54f6d --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace operators { + +void Poly2Mask(const float* poly, int k, int h, int w, uint8_t* mask); + +void Poly2Boxes(const std::vector>>& polys, + float* boxes); + +void Polys2MaskWrtBox(const std::vector>& polygons, + const float* box, int M, uint8_t* mask); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/mask_util_test.cc b/paddle/fluid/operators/detection/mask_util_test.cc new file mode 100644 index 0000000000..de904e9474 --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util_test.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/mask_util.h" +#include +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace operators { + +template +void Compare(const T* a, const T* b, const int n) { + for (int i = 0; i < n; i++) { + EXPECT_EQ(a[i], b[i]); + } +} + +TEST(MaskUtil, Poly2MaskTest) { + float polys[] = {1.97f, 1.88f, 5.81f, 1.88f, 1.69f, + 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}; + int h = 8, w = 8; + int k = 5; // length(polys) / 2 + // clang-format off + uint8_t expect_mask[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + // the ground-truth mask is computed by the coco API: + // + // import pycocotools.mask as mask_util + // import numpy as np + // segm = [1.97, 1.88, 5.81, 1.88, 1.69, 6.53, 5.94, 6.38, 1.97, 1.88] + // rles = mask_util.frPyObjects([segm], im_h, im_w) + // mask = mask_util.decode(rles) + // print mask + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_mask)); + uint8_t* mask = reinterpret_cast(allocation->ptr()); + Poly2Mask(polys, k, h, w, mask); + Compare(expect_mask, mask, h * w); +} + +TEST(MaskUtil, Poly2BoxesTest) { + // clang-format off + std::vector>> polys = { + {{1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}}, + {{2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}} + }; + float expect_boxes[] = { + 1.69f, 1.88f, 5.94f, 6.53f, + 1.69f, 0.88f, 6.94f, 6.63f + }; + // clang-format on + + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_boxes)); + float* boxes = reinterpret_cast(allocation->ptr()); + Poly2Boxes(polys, boxes); + Compare(expect_boxes, boxes, 8); +} + +TEST(MaskUtil, Polys2MaskWrtBoxTest) { + // clang-format off + std::vector>> polys = {{ 
{1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}, + {2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}}}; + float expect_boxes[] = { + 1.69f, 0.88f, 6.94f, 6.63f + }; + uint8_t expect_mask[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + // clang-format on + + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_boxes)); + float* boxes = reinterpret_cast(allocation->ptr()); + Poly2Boxes(polys, boxes); + Compare(expect_boxes, boxes, 4); + + auto allocat_mask = memory::Alloc(cpu, sizeof(expect_mask)); + uint8_t* mask = reinterpret_cast(allocat_mask->ptr()); + int M = 8; + Polys2MaskWrtBox(polys[0], expect_boxes, M, mask); + Compare(expect_mask, mask, M * M); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 6a61a8d786..fc28fe818d 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -20,7 +20,7 @@ if(WITH_GRPC) collective_client.cc collective_server.cc ${GRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory) + DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) @@ -32,15 +32,17 @@ else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib) + brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc request_handler_impl.cc rpc_client.cc rpc_server.cc variable_response.cc collective_client.cc collective_server.cc ${BRPC_SRCS} - PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto - DEPS lod_tensor selected_rows memory) + PROTO send_recv.proto + DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib) + set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL) endif() diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc index 87bdb83503..b8e63f42e2 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -62,7 +62,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "SendRPC"; + const std::string method = kSendRPC; VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); framework::AsyncIO([=] { @@ -156,15 +156,18 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + 
const std::string& out_var_name, const std::string& method_name, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; + const std::string out_varname_val = out_var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "GetRPC"; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + const std::string method = kGetRPC; + VarHandlePtr var_h( + new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); framework::AsyncIO([=] { auto ch_ctx = ch_ptr->Pop(); @@ -175,6 +178,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); req.set_trainer_id(trainer_id_); google::protobuf::Closure* done = brpc::NewCallback( @@ -182,8 +186,10 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, platform::RecordRPCEvent record_event(method, p_ctx); - if (method_name == "GetMonomerVariable") { + if (method_name == kGetMonomerRPC) { ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); + } else if (method_name == kGetNoBarrierRPC) { + ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); } else { ch_ctx->stub->GetVariable(cntl, &req, response, done); } @@ -198,25 +204,39 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, return var_h; } +VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, + kGetNoBarrierRPC, time_out); +} + VarHandlePtr BRPCClient::AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); + return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, + time_out); } VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, const std::string& var_name, int64_t time_out) { - return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); + return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); } VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); + return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, + time_out); } VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, @@ -234,7 +254,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); - const std::string method = "PrefetchRPC"; + const std::string method = kPrefetchRPC; VarHandlePtr var_h( new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); @@ -270,7 +290,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { - return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, + 
return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, time_out); } @@ -286,7 +306,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); - const std::string method = "FetchBarrierRPC"; + const std::string method = kFetchBarrierRPC; // var handle VarHandlePtr var_h( new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); @@ -367,7 +387,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { - return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); + return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); } void BRPCClient::SendComplete() { @@ -394,9 +414,9 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage( google::protobuf::Closure* done = brpc::NewCallback( &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - if (method_name == "CheckPointNotifyRPC") { + if (method_name == kCheckPointNotifyRPC) { ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == "GetMonomerBarrier") { + } else if (method_name == kSendMonomerFetchBarrierRPC) { ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); } else { ch_ctx->stub->SendVariable(cntl, &req, response, done); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h index 2066ade8a5..501a593b11 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -65,6 +65,7 @@ class BRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncGetMonomerBarrier( @@ -76,6 +77,13 @@ class BRPCClient : public RPCClient { const framework::Scope& scope, const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline); + VarHandlePtr AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, @@ -103,6 +111,7 @@ class BRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, int64_t time_out = FLAGS_rpc_deadline); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc index cbe0bd09c7..fea9b09414 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.cc @@ -45,6 +45,13 @@ class BRPCServiceImpl : public SendRecvService { rpc_server_->GetThreadNum(distributed::kRequestGet))); } + it = rpc_call_map.find(distributed::kRequestGetNoBarrier); + if (it != rpc_call_map.end()) { + request_getnobarrier_h_ = it->second; + getnobarrier_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); + } + it = rpc_call_map.find(distributed::kRequestPrefetch); if (it != rpc_call_map.end()) { request_prefetch_h_ = it->second; @@ -112,6 +119,14 @@ class 
BRPCServiceImpl : public SendRecvService { [=] { _GetVariable(cntl_butil, request, response, done); }); } + + void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + getnobarrier_threads_->Run( + [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); + } + + void _GetVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) { @@ -122,23 +137,59 @@ class BRPCServiceImpl : public SendRecvService { brpc::Controller* cntl = static_cast(cntl_butil); std::string varname = request->varname(); + std::string out_varname = request->out_varname(); VLOG(3) << "RequestGet varname:" << varname + << ", out_varname:" << out_varname << ", trainer_id:" << request->trainer_id() << ", from:" << cntl->remote_side(); auto scope = request_get_h_->scope(); - auto invar = scope->FindVar(varname); + paddle::framework::Variable* invar = nullptr; + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; + + request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + distributed::SerializeToIOBuf(out_varname, outvar, + *request_get_h_->dev_ctx(), response, + &cntl->response_attachment(), "", false); + } + } + + void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE(request_getnobarrier_h_ != nullptr, + "RequestGetNoBarrier handler should be registered first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + std::string out_varname = request->out_varname(); int trainer_id = request->trainer_id(); + + VLOG(3) << "RequestGetNoBarrier varname:" << varname + << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id + << ", from:" << cntl->remote_side(); + + auto scope = request_getnobarrier_h_->scope(); + paddle::framework::Variable* invar = nullptr; paddle::framework::Variable* outvar = nullptr; - request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id); + request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); if (outvar) { - distributed::SerializeToIOBuf(varname, outvar, *request_get_h_->dev_ctx(), - response, &cntl->response_attachment(), "", - false); + distributed::SerializeToIOBuf( + out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, + &cntl->response_attachment(), "", false); } } + void PrefetchVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, @@ -282,6 +333,7 @@ class BRPCServiceImpl : public SendRecvService { private: distributed::RequestHandler* request_send_h_{nullptr}; distributed::RequestHandler* request_get_h_{nullptr}; + distributed::RequestHandler* request_getnobarrier_h_{nullptr}; distributed::RequestHandler* request_prefetch_h_{nullptr}; distributed::RequestHandler* request_checkpoint_h_{nullptr}; distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; @@ -289,9 +341,10 @@ class BRPCServiceImpl : public SendRecvService { distributed::RPCServer* rpc_server_{nullptr}; - // FIXME(gongwb): brpc should support process one rpce use one threadpool. + // FIXME(gongwb): brpc should support processing one rpc with one threadpool. 
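(The FIXME above is the reason this service owns the pools declared just below: since brpc does not run each RPC type on its own threadpool, every handler immediately re-queues its work onto a per-request-type paddle::framework::ThreadPool, as the getnobarrier_threads_->Run(...) call above shows. Here is a self-contained, single-worker stand-in for the Run() interface those pools expose; MiniPool is our name, and the real pool is multi-threaded.)

#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>

// A single-worker task queue: each BRPC request type above owns one pool,
// so slow Get traffic cannot starve Send handling (and vice versa).
class MiniPool {
 public:
  void Run(std::function<void()> fn) {
    {
      std::lock_guard<std::mutex> g(mu_);
      tasks_.push(std::move(fn));
    }
    cv_.notify_one();
  }
  ~MiniPool() {
    {
      std::lock_guard<std::mutex> g(mu_);
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();
  }

 private:
  void Loop() {
    for (;;) {
      std::function<void()> fn;
      {
        std::unique_lock<std::mutex> lk(mu_);
        cv_.wait(lk, [this] { return stop_ || !tasks_.empty(); });
        if (stop_ && tasks_.empty()) return;
        fn = std::move(tasks_.front());
        tasks_.pop();
      }
      fn();  // run the queued handler, e.g. _GetVariableNoBarrier(...)
    }
  }
  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> tasks_;
  bool stop_ = false;
  std::thread worker_{[this] { Loop(); }};  // started last, after the members above
};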
std::unique_ptr send_threads_; std::unique_ptr get_threads_; + std::unique_ptr getnobarrier_threads_; std::unique_ptr prefetch_threads_; std::unique_ptr checkpoint_notify_threads_; }; diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc index 5009058422..90f2f9fd65 100644 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/distributed/collective_client.h" #include "paddle/fluid/operators/distributed/collective_server.h" @@ -57,7 +58,7 @@ std::unique_ptr GenerateVars(platform::Place place) { auto* tensor = slr->mutable_value(); auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({20000, 1024})); + tensor->Resize(framework::make_ddim({3, 1024})); tensor->mutable_data(place); paddle::operators::math::set_constant(ctx, tensor, 32.7); @@ -80,6 +81,20 @@ void Gather(const std::vector& vars, std::vector dst; client->Gather(vars, &dst, *dev_ctx, scope); std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); + dev_ctx->Wait(); + + ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024})); + ASSERT_EQ(dst[0]->height(), 20000); + ASSERT_EQ(dst[0]->rows().size(), static_cast(3)); + for (int i = 0; i < 3; i++) { + ASSERT_EQ(dst[0]->rows()[i], i); + } + + std::vector vec; + TensorToVector(dst[0]->value(), *dev_ctx, &vec); + for (size_t i = 0; i < 3 * 1024; i++) { + ASSERT_FLOAT_EQ(vec[i], 32.7); + } } TEST(CollectiveServer, GPU) { diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc index 7875c16c3c..52310f8d04 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -74,7 +74,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); SendProcessor* s = new SendProcessor(ch); - const std::string method = "SendRPC"; + const std::string method = kSendRPC; VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); @@ -107,7 +107,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, void ProcGetResponse(const VarHandle& var_h, const ::grpc::ByteBuffer& ret_msg) { - VLOG(100) << "ProcGetResponse"; + VLOG(4) << "ProcGetResponse"; framework::Variable* outvar = nullptr; // get response's trainer_id is not used int trainer_id; @@ -127,59 +127,74 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, + return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, "/sendrecv.SendRecvService/GetVariable", time_out); } +VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar( 
+ ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, + "/sendrecv.SendRecvService/GetVariableNoBarrier", time_out); +} + VarHandlePtr GRPCClient::AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, + return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, "/sendrecv.SendRecvService/GetMonomerVariable", time_out); } -VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& rpc_path, - int64_t time_out) { +VarHandlePtr GRPCClient::_AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& method, + const std::string& var_name, const std::string& out_varname, + const std::string& rpc_path, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; + const std::string out_varname_val = out_varname; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); - const std::string method = "GetRPC"; - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, s, method, p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_trainer_id(trainer_id_); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); + framework::AsyncIO( + [var_name_val, out_varname_val, s, method, p_ctx, h, rpc_path, this] { + // prepare input + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); + req.set_trainer_id(trainer_id_); + ::grpc::ByteBuffer buf; + RequestToByteBuffer(req, &buf); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - // stub context - s->response_call_back_ = ProcGetResponse; + // stub context + s->response_call_back_ = ProcGetResponse; - platform::RecordRPCEvent record_event(method, p_ctx); + platform::RecordRPCEvent record_event(method, p_ctx); - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + auto call = + s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); req_count_++; @@ -202,7 +217,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); - const std::string method = "PrefetchRPC"; + const std::string method = kPrefetchRPC; VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); @@ -242,7 +257,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "BatchBarrierRPC"; + const std::string method = kBatchBarrierRPC; 
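// Every async call in this client follows the pattern visible around this
// point: a VarHandle records (endpoint, rpc method name, variable name,
// device context, scope), Prepare(h, time_out) arms its deadline, and the
// caller may later block on the handle (see the h->Wait() call under
// platform::IsProfileEnabled() above).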
VarHandlePtr h( new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -267,7 +282,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - const std::string method = "FetchBarrierRPC"; + const std::string method = kFetchBarrierRPC; VarHandlePtr h( new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -293,7 +308,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "SendMonomerFetchBarrierRPC"; + const std::string method = kSendMonomerFetchBarrierRPC; VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); s->Prepare(h, time_out); @@ -320,7 +335,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = "SendCompleteRPC"; + const std::string method = kSendCompleteRPC; VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); s->Prepare(h, time_out); @@ -347,7 +362,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - const std::string method = "CheckPointNotifyRPC"; + const std::string method = kCheckPointNotifyRPC; VarHandlePtr h( new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h index fa77d21257..ce0d2152aa 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -186,8 +186,15 @@ class GRPCClient : public RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, @@ -228,11 +235,11 @@ class GRPCClient : public RPCClient { void Proceed(); std::shared_ptr GetChannel(const std::string& ep); - VarHandlePtr _AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, const std::string& rpc, - int64_t time_out); + VarHandlePtr _AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& method, + const std::string& var_name, const std::string& out_varname, + const std::string& rpc_path, int64_t time_out = FLAGS_rpc_deadline); private: grpc::CompletionQueue cq_; diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index 08f777e279..4a9c158cb0 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -136,17 +136,65 @@ class RequestGet final : public 
RequestBase { void Process() override { // proc request. std::string varname = request_.varname(); + std::string out_varname = request_.out_varname(); int trainer_id = request_.trainer_id(); - VLOG(4) << "RequestGet " << varname; + + VLOG(4) << "RequestGet " << out_varname << " from " << varname; auto scope = request_handler_->scope(); - auto invar = scope->FindVar(varname); + framework::Variable* invar = nullptr; framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); + request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); if (outvar) { - SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), + SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), + &reply_); + } + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; +}; + +class RequestGetNoBarrier final : public RequestBase { + public: + explicit RequestGetNoBarrier(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetVariableNoBarrier); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetNoBarrier() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. + std::string varname = request_.varname(); + std::string out_varname = request_.out_varname(); + int trainer_id = request_.trainer_id(); + + VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; + + auto scope = request_handler_->scope(); + framework::Variable* invar = nullptr; + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), &reply_); } Finish(reply_, &responder_); @@ -460,6 +508,9 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, b = new RequestSend(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGet) { b = new RequestGet(&service_, cq.get(), handler, req_id); + + } else if (rpc_name == kRequestGetNoBarrier) { + b = new RequestGetNoBarrier(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGetMonomerVariable) { b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id, this); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h index 0b5c5151e6..2965fe4490 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h @@ -81,6 +81,7 @@ enum class GrpcMethod { kGetVariable, kPrefetchVariable, kCheckpointNotify, + kGetVariableNoBarrier, kGetMonomerVariable, kGetMonomerBarrier, }; @@ -94,6 +95,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/SendVariable"; case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kGetVariableNoBarrier: + return "/sendrecv.SendRecvService/GetVariableNoBarrier"; case GrpcMethod::kGetMonomerVariable: return "/sendrecv.SendRecvService/GetMonomerVariable"; case 
GrpcMethod::kGetMonomerBarrier: diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 62b24f150b..991158ac72 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -42,11 +42,24 @@ constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; +constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; + +constexpr char kSendRPC[] = "SendRPC"; +constexpr char kGetRPC[] = "GetRPC"; +constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; +constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; +constexpr char kPrefetchRPC[] = "PrefetchRPC"; +constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; +constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; +constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; +constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; +constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" #define COMPLETE_MESSAGE "COMPLETE@RECV" +#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 9722f8c96e..a1c5c07774 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/string/piece.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -53,6 +54,11 @@ bool RequestSendHandler::Handle(const std::string& varname, // Async if (!sync_mode_) { VLOG(3) << "async process var: " << varname; + if (varname == BATCH_BARRIER_MESSAGE) { + PADDLE_THROW( + "async mode should not recv BATCH_BARRIER_MESSAGE or " + "COMPLETE_MESSAGE"); + } try { executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), scope); @@ -81,7 +87,8 @@ bool RequestGetHandler::Handle(const std::string& varname, const int trainer_id, const std::string& out_var_name, const std::string& table_name) { - VLOG(4) << "RequestGetHandler:" << varname; + VLOG(4) << "RequestGetHandler:" << varname + << " out_var_name: " << out_var_name; if (sync_mode_) { if (varname == FETCH_BARRIER_MESSAGE) { @@ -112,6 +119,32 @@ bool RequestGetHandler::Handle(const std::string& varname, return true; } +bool RequestGetNoBarrierHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const int trainer_id, + const std::string& out_var_name, + const std::string& table_name) { + VLOG(4) << "RequestGetNoBarrierHandler:" << varname + << " out_var_name: " << out_var_name; + + // get var from pserver immediately without barriers + string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); + string::Piece var_name_piece = string::Piece(varname); + + if (string::Contains(var_name_piece, without_barrier_piece)) { + 
var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); + VLOG(4) << "Get var " << var_name_piece << " with " + << WITHOUT_BARRIER_MESSAGE; + *outvar = scope_->FindVar(var_name_piece.ToString()); + return true; + } else { + PADDLE_THROW("GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE); + } + return true; +} + bool RequestPrefetchHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index 5e0b25c5c2..f3c1b24526 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -67,6 +67,16 @@ class RequestGetHandler final : public RequestHandler { bool enable_dc_asgd_; }; +class RequestGetNoBarrierHandler final : public RequestHandler { + public: + RequestGetNoBarrierHandler() : RequestHandler(false) {} + virtual ~RequestGetNoBarrierHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; +}; + static inline void BuildVar(const std::string& param_name, std::initializer_list arguments, paddle::framework::proto::OpDesc::Var* var) { diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index b668d86978..ea54e0c295 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -43,8 +43,15 @@ class RPCClient { const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetMonomerVariable( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& var_name, diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index cc5b9c29a1..c3a46e348c 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -39,27 +39,33 @@ void RPCServer::SavePort() const { port_file.open(file_path); port_file << selected_port_; port_file.close(); - VLOG(4) << "selected port written to " << file_path; + VLOG(3) << "selected port written to " << file_path; } void RPCServer::WaitBarrier(const std::string& rpc_name) { + VLOG(3) << "WaitBarrier in: " << rpc_name; std::unique_lock lock(this->mutex_); barrier_cond_.wait(lock, [this, &rpc_name] { return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || exit_flag_.load()); }); - VLOG(3) << "batch_barrier_: " << rpc_name << " " - << barrier_counter_[rpc_name]; + VLOG(3) << "WaitBarrier out: " << rpc_name + << " counter: " << barrier_counter_[rpc_name]; } void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + // barrier msg should make sure that it's in the right cond(send|recv) + WaitCond(rpc_name); int b 
= 0; std::unique_lock lock(mutex_); b = ++barrier_counter_[rpc_name]; + VLOG(3) << rpc_name << " barrier_counter: " << b; if (b >= client_num_) { lock.unlock(); + VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " + << rpc_name; barrier_cond_.notify_all(); lock.lock(); } @@ -71,7 +77,7 @@ void RPCServer::Complete() { client_num_--; need_reset_all_vars_ = true; - VLOG(4) << "decrease client_num to: " << client_num_; + VLOG(3) << "decrease client_num to: " << client_num_; if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { barrier_counter_[kRequestGet]--; } @@ -105,8 +111,8 @@ void RPCServer::RegisterRPC(const std::string& rpc_name, static int cond = -1; rpc_cond_map_[rpc_name] = ++cond; - VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler - << ", cond:" << rpc_cond_map_[rpc_name]; + VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler + << ", cond: " << rpc_cond_map_[rpc_name]; } void RPCServer::SetCond(const std::string& rpc_name) { @@ -120,7 +126,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { } void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(4) << "RPCServer WaitCond " << rpc_name; + VLOG(3) << "RPCServer WaitCond in " << rpc_name; int cond = 0; { std::unique_lock lock(mutex_); @@ -130,6 +136,7 @@ void RPCServer::WaitCond(const std::string& rpc_name) { std::unique_lock lock(mutex_); rpc_cond_.wait( lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); + VLOG(3) << "RPCServer WaitCond out " << rpc_name; } void RPCServer::RegisterVar(const std::string& var_name, @@ -151,7 +158,7 @@ void RPCServer::RegisterVar(const std::string& var_name, } rpc_cond_.notify_all(); - VLOG(4) << "RegisterVar context:" << h.String(); + VLOG(3) << "RegisterVar context:" << h.String(); } void RPCServer::IncreaseVarBarrier(const std::string& var_name) { @@ -167,11 +174,11 @@ void RPCServer::IncreaseVarBarrier(const std::string& var_name) { barrier_cond_.notify_all(); } - VLOG(4) << "IncreaseVarBarrier context:" << h.String(); + VLOG(3) << "IncreaseVarBarrier context:" << h.String(); } void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(4) << "WaitBarrier var_name:" << var_name; + VLOG(3) << "WaitVarBarrier var_name:" << var_name; std::unique_lock lock(mutex_); barrier_cond_.wait(lock, [&]() { @@ -179,11 +186,11 @@ void RPCServer::WaitVarBarrier(const std::string& var_name) { exit_flag_.load()); }); - VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String(); + VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); } void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(4) << "SetVarCond var_name:" << var_name; + VLOG(3) << "SetVarCond var_name:" << var_name; { std::unique_lock lock(mutex_); if (var_map_.find(var_name) != var_map_.end()) { @@ -193,14 +200,14 @@ void RPCServer::SetVarCond(const std::string& var_name) { } void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(4) << "WaitVarCond var_name:" << var_name; + VLOG(3) << "WaitVarCond var_name:" << var_name; std::unique_lock lock(mutex_); rpc_cond_.wait(lock, [=] { return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); }); - VLOG(4) << "WaitVarCond var_name:" << var_name << " end"; + VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; } MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in index b39eef04d8..6303667884 100644 --- 
a/paddle/fluid/operators/distributed/send_recv.proto.in +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -17,8 +17,14 @@ package sendrecv; option cc_generic_services = @cc_generic_services@; service SendRecvService { + // For parameter server round-robin like hashing, do not split tensors. + // Send and recv only one tensor. + // TODO(typhoonzero): add streaming API rpc SendVariable(VariableMessage) returns (VoidMessage) {} + // Argument VariableMessage for GetVariable should only contain varname. rpc GetVariable(VariableMessage) returns (VariableMessage) {} + rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {} + // pre-fetch variable by the given variable name and Ids rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} @@ -27,12 +33,17 @@ service SendRecvService { rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} } +// It can be: LoDTensor, SelectedRows or NCCL_ID enum VarType { LOD_TENSOR = 0; SELECTED_ROWS = 1; NCCL_ID = 2; } +// VariableMessage is a serialized paddle variable message. +// NOTICE(gongwb): don't modify this proto if you are not +// familiar with how we serialize it in sendrecvop_utils.h +// and deserialize it in variable_response.h. message VariableMessage { enum Type { // Pod Types @@ -49,14 +60,21 @@ message VariableMessage { string varname = 1; // TODO(Yancey1989): reference framework::proto::VarDesc::VarType VarType type = 2; + // bool persistable is not needed for sending. + // tensor info: Type data_type = 3; repeated int64 dims = 4; + // lod details: int64 lod_level = 5; repeated LodData lod = 6; + // selected_rows height, a.k.a. the original dim0 int64 slr_height = 7; + // tensor data bytes serialized = 8; + // selected_rows data bytes rows = 9; + // Look up table block execution output variable name. string out_varname = 10; // If 1, the ps server will start profiling, the ps // server stops profiling and generates a profile to /tmp/profile_ps_* diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 47ff568a11..7825b4fc82 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -117,8 +117,9 @@ bool VariableResponse::CopyLodTensorData( tensor->mutable_data(ctx.GetPlace(), ToVarType(meta_.data_type())); VLOG(6) << "Tensor.memory_size = " << tensor->memory_size() - << ", Buffer Size = " << length; - PADDLE_ENFORCE_EQ(tensor->memory_size(), static_cast(length)); + << ", Buffer Size = " << length << ", dims:" << dims + << ", numel:" << tensor->numel(); + PADDLE_ENFORCE_GE(tensor->memory_size(), static_cast(length)); return ReadRaw(input, ctx, tensor->place(), tensor_data, length); } diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 629f364d71..5b30ed472d 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -137,7 +137,9 @@ void ListenAndServOp::RunSyncLoop( while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrive, just add suffix 0~n and merge the gradients.
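+ // A rough trace of one synchronous step, assuming client_num_ == 2
+ // (the split gradient names below are illustrative only):
+ //   1. both trainers send their gradient pieces, e.g. w@GRAD.trainer_0
+ //      and w@GRAD.trainer_1, each followed by a send_barrier message;
+ //   2. IncreaseBatchBarrier counts those barriers until the counter
+ //      reaches client_num_, which releases WaitBarrier(kRequestSend);
+ //   3. the optimize blocks run, kRequestGet is opened so the trainers
+ //      can fetch the updated parameters, and the counters are reset.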
+ VLOG(3) << "wait all clients to send gradient"; rpc_service_->SetCond(distributed::kRequestSend); + VLOG(3) << "wait all clients to send send_barrier"; rpc_service_->WaitBarrier(distributed::kRequestSend); if (rpc_service_->IsExit()) { @@ -168,12 +170,16 @@ void ListenAndServOp::RunSyncLoop( } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); - VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(3) << "ResetReceivedVars"; ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); + VLOG(3) << "wait all clients to get parameters back"; rpc_service_->SetCond(distributed::kRequestGet); + VLOG(3) << "wait all clients to send fetch_barrier"; rpc_service_->WaitBarrier(distributed::kRequestGet); + VLOG(3) << "ResetBarrierCounter"; rpc_service_->ResetBarrierCounter(); } // while(true) } @@ -347,6 +353,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, new distributed::RequestPrefetchHandler(sync_mode)); request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler( sync_mode, checkpoint_block_id)); + request_get_no_barrier_handler_.reset( + new distributed::RequestGetNoBarrierHandler()); rpc_service_->RegisterRPC(distributed::kRequestSend, request_send_handler_.get(), @@ -359,6 +367,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, FLAGS_rpc_prefetch_thread_num); rpc_service_->RegisterRPC(distributed::kRequestCheckpoint, request_checkpoint_handler_.get()); + rpc_service_->RegisterRPC(distributed::kRequestGetNoBarrier, + request_get_no_barrier_handler_.get()); auto optimize_blocks = Attr>(kOptimizeBlocks); @@ -413,6 +423,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, f(request_get_handler_.get()); f(request_prefetch_handler_.get()); f(request_checkpoint_handler_.get()); + f(request_get_no_barrier_handler_.get()); // start the server listening after all member initialized. 
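+ // At this point the kRequestSend, kRequestGet, kRequestGetNoBarrier,
+ // kRequestPrefetch and kRequestCheckpoint handlers have all been
+ // registered with rpc_service_ and initialized through the same helper
+ // f, so the no-barrier Get path shares its state with the ordinary one.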
server_thread_.reset(new std::thread(RunServer, rpc_service_)); diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h index 9431978df8..f20442bad7 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h @@ -55,7 +55,6 @@ class ListenAndServOp : public framework::OperatorBase { const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs); - virtual ~ListenAndServOp(); void RunSyncLoop(framework::Executor* executor, @@ -89,6 +88,8 @@ class ListenAndServOp : public framework::OperatorBase { mutable std::shared_ptr rpc_service_; mutable std::shared_ptr request_send_handler_; mutable std::shared_ptr request_get_handler_; + mutable std::shared_ptr + request_get_no_barrier_handler_; mutable std::shared_ptr request_prefetch_handler_; mutable std::shared_ptr diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.h b/paddle/fluid/operators/distributed_ops/merge_ids_op.h index 99c5759019..05c00251b9 100644 --- a/paddle/fluid/operators/distributed_ops/merge_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.h @@ -43,9 +43,9 @@ class MergeIdsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(ids.size(), outs.size(), "the number of Ids and Out should be the same"); - size_t row_ids_size = 0; - int row_size = 0; - int embedding_size = 0; + int64_t row_ids_size = 0; + int64_t row_size = 0; + int64_t embedding_size = 0; for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *x_tensor = x_tensors[i]; @@ -69,7 +69,7 @@ class MergeIdsOpKernel : public framework::OpKernel { for (size_t i = 0; i < x_tensors.size(); ++i) { const auto *row_id = row_ids[i]; - for (int j = 0; j < row_id->numel(); ++j) { + for (auto j = 0; j < row_id->numel(); ++j) { int64_t key = row_id->data()[j]; std::tuple val = std::make_tuple(i, j); selected_rows_idx_map.insert(std::make_pair(key, val)); @@ -84,13 +84,13 @@ class MergeIdsOpKernel : public framework::OpKernel { out->set_lod(out_ids->lod()); - int nums = static_cast(out_ids->dims()[0]); + auto nums = out_ids->dims()[0]; auto *out_data = out->mutable_data( framework::make_ddim({nums, embedding_size}), place); - for (int j = 0; j < nums; ++j) { - int id = out_ids->data()[j]; - auto row_tuple = selected_rows_idx_map[id]; - int64_t row_idx = std::get<1>(row_tuple); + for (auto j = 0; j < nums; ++j) { + auto id = out_ids->data()[j]; + auto row_tuple = selected_rows_idx_map.at(id); + auto row_idx = std::get<1>(row_tuple); const auto *x_tensor = x_tensors[std::get<0>(row_tuple)]; memcpy(out_data + embedding_size * j, diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 48065437e3..120c65f296 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -27,30 +27,50 @@ namespace operators { class RecvOp : public framework::OperatorBase { public: - RecvOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) + RecvOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { 
- auto outs = Outputs("Out"); + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { std::vector epmap = Attr>("epmap"); + std::vector varnames = + Attr>("varnames"); int sync_mode = Attr("sync_mode"); + auto outs = Outputs("Out"); + bool with_barrier = Attr("with_barrier"); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(place); - distributed::RPCClient* rpc_client = + distributed::RPCClient *rpc_client = distributed::RPCClient::GetInstance( Attr("trainer_id")); - std::vector rets; - for (size_t i = 0; i < outs.size(); i++) { - VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; - rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i])); - } - if (sync_mode) { + if (with_barrier) { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVar"; + rets.push_back( + rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i])); + } + if (sync_mode) { + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } + } else { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVarNoBarrier"; + rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope, + varname, outs[i])); + } for (size_t i = 0; i < rets.size(); i++) { PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } @@ -79,12 +99,23 @@ This operator can get variables from server side. "(int, default 0)" "sync recv or async recv.") .SetDefault(0); + AddAttr("with_barrier", + "(bool, default True) if with_barrier=False, will use " + "AsyncGetVarNoBarrier to get the variable from the pserver " + "immediately") + .SetDefault(true); + AddAttr>( + "varnames", + "(string vector, default {}) " + "sometimes we need to store the received var under another name, " + "for example: we need a var named 'moment_1@127.0.0.1:1001', " + "and its real name on the parameter server is 'moment_1'. 
") + .SetDefault({}); } }; class RecvOpShapeInference : public framework::InferShapeBase { public: - void operator()(framework::InferShapeContext* ctx) const override {} + void operator()(framework::InferShapeContext *ctx) const override {} }; } // namespace operators diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 7bb6934e14..cb8a4e7e15 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -277,68 +277,6 @@ class TransformFunctor { Functor func_; }; -#define EIGEN_FUNCTOR(name, eigen_op) \ - struct Eigen##name##Functor { \ - template \ - inline void Run(const framework::Tensor *x, const framework::Tensor *y, \ - framework::Tensor *z, \ - const framework::ExecutionContext &ctx) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_e); \ - } \ - template \ - inline void RunBroadCast(const framework::Tensor *x, \ - const framework::Tensor *y, framework::Tensor *z, \ - const framework::ExecutionContext &ctx, int pre, \ - int n) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ - .broadcast(Eigen::DSizes(pre, 1)) \ - .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_bcast); \ - } \ - template \ - inline void RunBroadCast2(const framework::Tensor *x, \ - const framework::Tensor *y, \ - framework::Tensor *z, \ - const framework::ExecutionContext &ctx, int pre, \ - int n, int post) { \ - auto x_e = framework::EigenVector::Flatten(*x); \ - auto y_e = framework::EigenVector::Flatten(*y); \ - auto z_e = framework::EigenVector::Flatten(*z); \ - auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ - .broadcast(Eigen::DSizes(pre, 1, post)) \ - .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device( \ - *ctx.template device_context().eigen_device()) = \ - eigen_op(x_e, y_bcast); \ - } \ - } - -#define EIGEN_ADD(x, y) ((x) + (y)) - -EIGEN_FUNCTOR(Add, EIGEN_ADD); - -#define EIGEN_SUB(x, y) ((x) - (y)) - -EIGEN_FUNCTOR(Sub, EIGEN_SUB); - -#define EIGEN_MUL(x, y) ((x) * (y)) - -EIGEN_FUNCTOR(Mul, EIGEN_MUL); - -#define EIGEN_DIV(x, y) ((x) / (y)) - -EIGEN_FUNCTOR(Div, EIGEN_DIV); - template struct ElemwiseGradNoBroadcast { const T *x_; diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 0a8c0814a7..55cef93aac 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -103,8 +103,10 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, + ops::GatherOpKernel, ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel, ops::GatherGradientOpKernel); diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 14a2524bd8..241184c6f4 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ 
b/paddle/fluid/operators/grid_sampler_op.cc @@ -43,12 +43,14 @@ class GridSampleOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2."); PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0], "Input(X) and Input(Grid) dims[0] should be equal."); - PADDLE_ENFORCE_EQ( - grid_dims[1], x_dims[2], - "Input(X) dims[2] and Input(Grid) dims[1] should be equal."); - PADDLE_ENFORCE_EQ( - grid_dims[2], x_dims[3], - "Input(X) dims[3] and Input(Grid) dims[2] should be equal."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ( + grid_dims[1], x_dims[2], + "Input(X) dims[2] and Input(Grid) dims[1] should be equal."); + PADDLE_ENFORCE_EQ( + grid_dims[2], x_dims[3], + "Input(X) dims[3] and Input(Grid) dims[2] should be equal."); + } ctx->SetOutputDim("Output", x_dims); ctx->ShareLoD("X", "Output"); diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 6e460c470b..3bf8586254 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -21,20 +21,20 @@ namespace operators { enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; -#define CHECK_CASE(i, flags, kernel_name, args...) \ - if (i == flags) { \ - kernel_name<<>>(args); \ +#define CHECK_CASE(i, flags, kernel_name, ...) \ + if (i == flags) { \ + kernel_name<<>>(__VA_ARGS__); \ } // 0 for no scale, no bias // 1 for has scale, no bias // 2 for no scale, has bias // 3 for has scale, has bias -#define UNROLL_ALL_CASES(flags, kernel_name, args...) \ - CHECK_CASE(0, flags, kernel_name, args) \ - CHECK_CASE(1, flags, kernel_name, args) \ - CHECK_CASE(2, flags, kernel_name, args) \ - CHECK_CASE(3, flags, kernel_name, args) +#define UNROLL_ALL_CASES(flags, kernel_name, ...) \ + CHECK_CASE(0, flags, kernel_name, __VA_ARGS__) \ + CHECK_CASE(1, flags, kernel_name, __VA_ARGS__) \ + CHECK_CASE(2, flags, kernel_name, __VA_ARGS__) \ + CHECK_CASE(3, flags, kernel_name, __VA_ARGS__) template __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 262094f922..35775d7ec9 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -21,5 +21,5 @@ endif() cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) if(NOT WIN32) - cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer) + cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer tensor) endif() diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 74d6a87247..186c37c56e 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -18,6 +18,7 @@ #include #include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/place.h" @@ -155,14 +156,22 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... 
args) { LOG(INFO) << loginfos.str(); } +using Tensor = paddle::framework::Tensor; + template void BenchXYZNKernel() { for (int d : TestSizes()) { - std::vector x(d), y(d), z(d); - RandomVec(d, x.data()); - RandomVec(d, y.data()); - BenchAllImpls, PlaceType>(d, x.data(), y.data(), - z.data(), d); + Tensor x, y, z; + x.Resize({d}); + y.Resize({d}); + z.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + T* z_data = z.mutable_data(PlaceType()); + RandomVec(d, x_data); + RandomVec(d, y_data); + BenchAllImpls, PlaceType>(d, x.data(), + y.data(), z_data, d); } } @@ -170,9 +179,13 @@ template void BenchAXYNKernel() { for (int d : TestSizes()) { const T a = static_cast(3); - std::vector x(d), y(d); - RandomVec(d, x.data()); - BenchAllImpls, PlaceType>(d, &a, x.data(), y.data(), + Tensor x, y; + x.Resize({d}); + y.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + RandomVec(d, x_data); + BenchAllImpls, PlaceType>(d, &a, x.data(), y_data, d); } } @@ -180,9 +193,13 @@ void BenchAXYNKernel() { template void BenchXYNKernel() { for (int d : TestSizes()) { - std::vector x(d), y(d); - RandomVec(d, x.data()); - BenchAllImpls, PlaceType>(d, x.data(), y.data(), d); + Tensor x, y; + x.Resize({d}); + y.Resize({d}); + T* x_data = x.mutable_data(PlaceType()); + T* y_data = y.mutable_data(PlaceType()); + RandomVec(d, x_data); + BenchAllImpls, PlaceType>(d, x.data(), y_data, d); } } @@ -192,16 +209,23 @@ void BenchLSTMKernel() { for (int d : TestSizes()) { const jit::lstm_attr_t attr(d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, use_peephole); - std::vector x(4 * d), ct_1(d), ct(d), ht(d), wp(3 * d), checked(2 * d); - RandomVec(4 * d, x.data(), -2.f, 2.f); - RandomVec(3 * d, wp.data(), -2.f, 2.f); - RandomVec(d, ct_1.data(), -2.f, 2.f); - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - T* x_data = x.data(); - T* checked_data = checked.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); + Tensor x, ct_1, ct, ht, wp, checked; + x.Resize({4 * d}); + ct_1.Resize({d}); + ct.Resize({d}); + ht.Resize({d}); + wp.Resize({3 * d}); + checked.Resize({2 * d}); + auto place = PlaceType(); + RandomVec(x.numel(), x.mutable_data(place), -2.f, 2.f); + RandomVec(wp.numel(), wp.mutable_data(place), -2.f, 2.f); + RandomVec(ct_1.numel(), ct_1.mutable_data(place), -2.f, 2.f); + const T* ct_1_data = ct_1.data(); + const T* wp_data = wp.data(); + T* x_data = x.mutable_data(place); + T* checked_data = checked.mutable_data(place); + T* ct_data = ct.mutable_data(place); + T* ht_data = ht.mutable_data(place); jit::lstm_t step; step.gates = x_data; step.ct_1 = ct_1_data; @@ -220,12 +244,16 @@ template void BenchGRUKernel() { for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); - std::vector x(3 * d), ht_1(d), ht(d); - RandomVec(3 * d, x.data(), -2.f, 2.f); - RandomVec(d, ht_1.data(), -2.f, 2.f); - const T* ht_1_data = ht_1.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); + auto place = PlaceType(); + Tensor x, ht_1, ht; + x.Resize({3 * d}); + ht_1.Resize({d}); + ht.Resize({d}); + RandomVec(3 * d, x.mutable_data(place), -2.f, 2.f); + RandomVec(d, ht_1.mutable_data(place), -2.f, 2.f); + const T* ht_1_data = ht_1.data(); + T* x_data = x.mutable_data(place); + T* ht_data = ht.mutable_data(place); jit::gru_t step; step.gates = x_data; step.ht_1 = ht_1_data; @@ -243,10 +271,12 @@ void BenchSeqPoolKernel() { jit::seq_pool_attr_t attr(w, type); for (int h : TestSizes()) { attr.h = h; 
- std::vector x(h * w), y(w); - RandomVec(h * w, x.data(), -2.f, 2.f); - const T* x_data = x.data(); - T* y_data = y.data(); + Tensor x, y; + x.Resize({h * w}); + y.Resize({w}); + RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.mutable_data(PlaceType()); BenchAllImpls, PlaceType>(attr, x_data, y_data, &attr); } @@ -259,12 +289,15 @@ void BenchMatMulKernel() { for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { for (int k : TestSizes()) { - std::vector a(m * k), b(k * n), c(m * n); - RandomVec(m * k, a.data(), -2.f, 2.f); - RandomVec(k * n, b.data(), -2.f, 2.f); - const T* a_data = a.data(); - const T* b_data = b.data(); - T* c_data = c.data(); + Tensor a, b, c; + a.Resize({m * k}); + b.Resize({k * n}); + c.Resize({m * n}); + RandomVec(m * k, a.mutable_data(PlaceType()), -2.f, 2.f); + RandomVec(k * n, b.mutable_data(PlaceType()), -2.f, 2.f); + const T* a_data = a.data(); + const T* b_data = b.data(); + T* c_data = c.mutable_data(PlaceType()); BenchAllImpls, PlaceType>(k, a_data, b_data, c_data, m, n, k); } diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 4e4f977fcc..097ba01d40 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -67,7 +67,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { mid->mutable_data(ctx.GetPlace()); const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha"); + // MKL-DNN implements LRN in the Caffe way: + // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html + // where the sum of squares is divided by the size of the normalization + // window; this is not the case for PaddlePaddle LRN. + // Hence we need to compensate for this difference by + // multiplying alpha by the size of the window (n). + const float alpha = ctx.Attr("alpha") * static_cast(n); const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); const bool is_test = ctx.Attr("is_test"); @@ -78,10 +84,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto dims = paddle::framework::vectorize2int(x->dims()); auto src_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - auto dst_md = paddle::platform::MKLDNNMemDesc( - dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + dims, mkldnn::memory::data_type::f32, x->format()); auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, mkldnn::lrn_across_channels, @@ -92,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { k}; auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; - auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, - static_cast(output_data)}; if (!is_test) { const std::string key = ctx.op().Output("Out"); @@ -110,11 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory->set_data_handle( static_cast(const_cast(input_data))); + auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(), + static_cast(output_data)); auto workspace_memory = insert_to_context( key_workspace_memory, dev_ctx, forward_pd->workspace_primitive_desc()); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } else { auto forward_pd = mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; @@ -122,8 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel
{ src_memory_pd, static_cast(const_cast(input_data))}; auto workspace_memory = mkldnn::memory{forward_pd.workspace_primitive_desc()}; + auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(), + static_cast(output_data)); run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } } }; @@ -151,7 +162,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { const std::string key_workspace_memory = key + "@lrn_workspace_memory"; const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha"); + const float alpha = ctx.Attr("alpha") * static_cast(n); const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index dc27e543f0..6bbb7155dd 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -54,6 +54,7 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) math_library(softmax DEPS math_function) +math_library(beam_search DEPS math_function) math_library(matrix_bit_code) @@ -68,6 +69,7 @@ cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) +cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search) if(WITH_GPU) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc new file mode 100644 index 0000000000..fb7119273a --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.cc @@ -0,0 +1,283 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/beam_search.h" +#include +#include + +namespace paddle { +namespace operators { +namespace math { + +template +class BeamSearchFunctor { + public: + void operator()(const platform::CPUDeviceContext &context, + const framework::LoDTensor *pre_ids, + const framework::LoDTensor *pre_scores, + const framework::LoDTensor *ids, + const framework::LoDTensor *scores, + framework::LoDTensor *selected_ids, + framework::LoDTensor *selected_scores, size_t level, + size_t beam_size, int end_id, bool is_accumulated) { + auto abs_lod = framework::ToAbsOffset(scores->lod()); + auto &high_level = abs_lod[level]; + + auto items = SelectTopBeamSizeItems(pre_ids, pre_scores, ids, scores, level, + beam_size, end_id, is_accumulated); + auto selected_items = ToMap(items, high_level.back()); + if (FLAGS_v == 3) { + VLOG(3) << "selected_items:"; + for (size_t i = 0; i < selected_items.size(); ++i) { + VLOG(3) << "offset: " << i; + for (auto &item : selected_items[i]) { + VLOG(3) << item.ToString(); + } + } + } + + PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id); + // calculate the output tensor's height + size_t num_instances = std::accumulate( + std::begin(selected_items), std::end(selected_items), 0, + [](size_t a, std::vector &b) { return a + b.size(); }); + // the output tensor shape should be [num_instances, 1] + auto dims = framework::make_ddim( + std::vector({static_cast(num_instances), 1})); + selected_ids->Resize(dims); + selected_scores->Resize(dims); + + auto *selected_ids_data = + selected_ids->mutable_data(platform::CPUPlace()); + auto *selected_scores_data = + selected_scores->mutable_data(platform::CPUPlace()); + + // fill in data + std::vector low_level; + size_t low_offset = 0; + for (auto &items : selected_items) { + low_level.push_back(low_offset); + for (auto &item : items) { + selected_ids_data[low_offset] = item.id; + selected_scores_data[low_offset] = item.score; + low_offset++; + } + } + low_level.push_back(low_offset); + + // fill lod + framework::LoD lod(2); + lod[0].assign(high_level.begin(), high_level.end()); + lod[1].assign(low_level.begin(), low_level.end()); + if (!framework::CheckLoD(lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + } + selected_ids->set_lod(lod); + selected_scores->set_lod(lod); + } + + /* + * The basic item used in sorting. + */ + struct Item { + Item() {} + Item(size_t offset, size_t id, float score) + : offset(offset), id(id), score(score) {} + // offset in the higher lod level. + size_t offset; + // prefix id in the lower lod level. + // size_t prefix; + // the candidate id + size_t id; + // the corresponding score + float score; + + inline bool operator<(const Item &in) const { + return (score < in.score) || + ((score == in.score) && (offset < in.offset)); + } + + inline void operator=(const Item &in) { + offset = in.offset; + id = in.id; + score = in.score; + } + + std::string ToString() { + std::ostringstream os; + os << "{"; + os << "offset: " << offset << ", "; + os << "id: " << id << ", "; + os << "score: " << score << ""; + os << "}"; + return os.str(); + } + }; + + protected: + /* + * Prune the source sentences whose branches have all finished; this step is + * optional. Pruning must happen one step later than finishing (thus pre_ids + * is needed here), since the end tokens must be written out.
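+ * For example, with end_id == 0: a source sentence is pruned only when
+ * every selected item has id 0 and its pre_ids entry is already 0, i.e.
+ * the beam carries no live hypothesis; a single surviving non-end item
+ * keeps the whole beam alive.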
+ */ + void PruneEndBeams(const framework::LoDTensor *pre_ids, + const framework::LoD &abs_lod, + std::vector> *items, size_t lod_level, + int end_id) { + auto *pre_ids_data = pre_ids->data(); + auto &high_level = abs_lod[lod_level]; + for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { + size_t src_prefix_start = high_level[src_idx]; + size_t src_prefix_end = high_level[src_idx + 1]; + bool finish_flag = true; + for (size_t offset = src_prefix_start; offset < src_prefix_end; + offset++) { + for (auto &item : items->at(offset)) { + if (item.id != static_cast(end_id) || + pre_ids_data[offset] != end_id) { + finish_flag = false; + break; + } + } + if (!finish_flag) break; + } + if (finish_flag) { // all branches of the beam (source sentence) end, so + // prune this beam + for (size_t offset = src_prefix_start; offset < src_prefix_end; + offset++) + items->at(offset).clear(); + } + } + } + + /* + * Transform the items into a map whose key is the offset and whose value is + * the items at that offset. + * NOTE: low performance. + */ + std::vector> ToMap( + const std::vector> &items, size_t element_num) { + std::vector> result; + result.resize(element_num); + for (auto &entries : items) { + for (const auto &item : entries) { + result[item.offset].push_back(item); + } + } + return result; + } + + void Insert(std::vector *top_beam_ptr, const Item &item, + size_t beam_size) { + std::vector &top_beam = *top_beam_ptr; + + size_t num_beams = top_beam.size(); + if (num_beams < beam_size) { + top_beam.resize(num_beams + 1); + num_beams++; + } else { + if (item < top_beam[beam_size - 1]) { + return; + } + } + + for (int k = static_cast(num_beams) - 2; k >= 0; --k) { + if (top_beam[k] < item) { + top_beam[k + 1] = top_beam[k]; + } else { + top_beam[k + 1] = item; + return; + } + } + top_beam[0] = item; + } + + /* + * For each source, select top beam_size records. + */ + std::vector> SelectTopBeamSizeItems( + const framework::LoDTensor *pre_ids, + const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids, + const framework::LoDTensor *scores, size_t lod_level, size_t beam_size, + int end_id, bool is_accumulated) { + std::vector> result; + + // find the current candidates + auto abs_lod = framework::ToAbsOffset(scores->lod()); + + auto *pre_ids_data = pre_ids->data(); + auto *pre_scores_data = pre_scores->data(); + + auto *ids_data = ids ? ids->data() : nullptr; + auto *scores_data = scores->data(); + + size_t num_seqs = scores->NumElements(lod_level); + size_t seq_width = 1; + for (int i = 1; i < scores->dims().size(); i++) { + seq_width *= scores->dims()[i]; + } + + for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) { + size_t seq_offset_start = abs_lod[lod_level][seq_id]; + size_t seq_offset_end = abs_lod[lod_level][seq_id + 1]; + + std::vector top_beam; + top_beam.reserve(beam_size); + + for (size_t offset = seq_offset_start; offset < seq_offset_end; + ++offset) { + auto pre_id = pre_ids_data[offset]; + auto pre_score = pre_scores_data[offset]; + if (pre_id == end_id) { + // Allocate all probability mass to end_id for finished branches; + // the other candidate ids can be ignored. + Item item(offset, end_id, pre_score); + Insert(&top_beam, item, beam_size); + } else { + size_t index = offset * seq_width; + for (size_t d = 0; d < seq_width; d++, index++) { + int64_t id = ids_data ? ids_data[index] : static_cast(d); + float score = is_accumulated + ?
scores_data[index] + : pre_score + std::log(scores_data[index]); + Item item(offset, id, score); + Insert(&top_beam, item, beam_size); + } + } + } + + result.emplace_back(top_beam); + } + + if (FLAGS_v == 3) { + VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); + for (auto &items : result) { + VLOG(3) << "item set:"; + for (auto &item : items) { + VLOG(3) << item.ToString(); + } + } + } + + return result; + } +}; + +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu new file mode 100644 index 0000000000..d94e3023ce --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.cu @@ -0,0 +1,393 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/beam_search.h" +#include "paddle/fluid/platform/cuda_device_function.h" + +namespace paddle { +namespace operators { +namespace math { + +struct Triple { + __device__ __forceinline__ Triple() {} + __device__ __forceinline__ Triple(int o, int i, float s) + : offset(o), id(i), score(s) {} + + __device__ __forceinline__ void set(int o, int i, float s) { + offset = o; + id = i; + score = s; + } + + __device__ __forceinline__ void operator=(const Triple& in) { + offset = in.offset; + id = in.id; + score = in.score; + } + + __device__ __forceinline__ bool operator<(const float s) const { + return score < s; + } + + __device__ __forceinline__ bool operator<(const Triple& in) const { + return (score < in.score) || ((score == in.score) && (offset < in.offset)); + } + + int offset; + int id; + float score; +}; + +__device__ __forceinline__ void Insert(Triple* top_beam, const Triple& p, + int beam_size) { + if (p < top_beam[beam_size - 1]) { + return; + } + for (int k = beam_size - 2; k >= 0; --k) { + if (top_beam[k] < p) { + top_beam[k + 1] = top_beam[k]; + } else { + top_beam[k + 1] = p; + return; + } + } + top_beam[0] = p; +} + +template +__device__ __forceinline__ int SelectTopBeam( + Triple* top_beam, const int64_t* pre_ids, const float* pre_scores, + const int64_t* ids, const float* scores, const int seq_offset_start, + const int seq_offset_end, const int seq_width, int beam_size, int end_id, + int used_threads) { + // top_beam is shared memory + const int tid = threadIdx.x; + const int tid_of_seq = threadIdx.x % MaxThreadsPerSeq; + + int num_used_threads = used_threads; + + Triple* top_beam_local = top_beam + tid * beam_size; + if (tid_of_seq < num_used_threads) { + for (int i = 0; i < beam_size; ++i) { + top_beam_local[i].set(-1, -1, -INFINITY); + } + + for (int offset = seq_offset_start; offset < seq_offset_end; ++offset) { + int pre_id = static_cast(pre_ids[offset]); + if (pre_id == end_id) { + if (tid_of_seq == 0) { + Triple tmp(offset, end_id, pre_scores[offset]); + Insert(top_beam_local, tmp, 
beam_size); + } + } else { + int index = offset * seq_width + tid_of_seq; + if (!IsAccumulated) { + float pre_score = pre_scores[offset]; + for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { + float score = pre_score + __logf(scores[index]); + int id = ids ? static_cast(ids[index]) : i; + Triple tmp(offset, id, score); + Insert(top_beam_local, tmp, beam_size); + index += num_used_threads; + } + } else { + for (int i = tid_of_seq; i < seq_width; i += num_used_threads) { + int id = ids ? static_cast(ids[index]) : i; + float score = scores[index]; + Triple tmp(offset, id, score); + Insert(top_beam_local, tmp, beam_size); + index += num_used_threads; + } + } + } + } + } + + while (num_used_threads > 1) { + if (num_used_threads > 16) { + __syncthreads(); + } + + num_used_threads = num_used_threads >> 1; + if (tid_of_seq < num_used_threads) { + int index_in_sh = (num_used_threads + tid) * beam_size; + for (int i = 0; i < beam_size; i++) { + Insert(top_beam_local, top_beam[index_in_sh], beam_size); + index_in_sh++; + } + } + } + + if (tid_of_seq == 0) { + int num_items = 0; + for (int i = 0; i < beam_size; ++i) { + num_items = + (top_beam_local[i].score > -INFINITY) ? num_items + 1 : num_items; + } + return num_items; + } + + return 0; +} + +__device__ __forceinline__ bool PruneEndBeams(Triple* top_beam_local, + const int64_t* pre_ids, + const int end_id, int num_items) { + bool finish_flag = true; + for (int i = 0; i < num_items; ++i) { + int offset = top_beam_local[i].offset; + if (top_beam_local[i].id != end_id || + static_cast(pre_ids[offset]) != end_id) { + finish_flag = false; + break; + } + } + return finish_flag; +} + +__device__ __forceinline__ void WriteBack( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + Triple* top_beam_local, const int seq_offset_start, + const int seq_offset_end, const int selected_seq_start, + const int selected_seq_length) { + const int tid = threadIdx.x; // use 1 thread only for each sequence + int global_index = selected_seq_start; + for (int global_offset = seq_offset_start; global_offset < seq_offset_end; + ++global_offset) { + for (int local_index = 0; local_index < selected_seq_length; + ++local_index) { + if (top_beam_local[local_index].offset == global_offset) { + selected_ids[global_index] = + static_cast(top_beam_local[local_index].id); + selected_scores[global_index] = top_beam_local[local_index].score; + global_index++; + } + } + selected_offsets[global_offset + 1] = static_cast(global_index); + } +} + +template +__device__ void BeamSearchDetails( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, + const float* scores, const int seq_offset_start, const int seq_offset_end, + const int seq_width, int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { + __shared__ Triple top_beam[MaxLength]; + + int num_items = 0; + if (is_accumulated) { + num_items = SelectTopBeam( + top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start, + seq_offset_end, seq_width, beam_size, end_id, num_used_threads); + } else { + num_items = SelectTopBeam( + top_beam, pre_ids, pre_scores, ids, scores, seq_offset_start, + seq_offset_end, seq_width, beam_size, end_id, num_used_threads); + } + + const int tid = threadIdx.x; // use 1 thread only for each sequence + const int tid_of_seq = tid % MaxThreadsPerSeq; + if (tid_of_seq == 0) { + // Use 1 thread for each sequence. 
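+ // The selected length of each sequence is published through shared
+ // memory below, so every sequence can prefix-sum the lengths of the
+ // sequences handled before it and derive its own write offset into
+ // selected_ids/selected_scores.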
+ Triple* top_beam_local = top_beam + tid * beam_size; + bool finish_flag = + PruneEndBeams(top_beam_local, pre_ids, end_id, num_items); + + int selected_seq_start = 0; + int selected_seq_length = finish_flag ? 0 : num_items; + + if (MaxSeqs > 1) { + const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + __shared__ int shared_mem[MaxSeqs]; + + // [0, MaxSeqs - 1], length of each sequences + shared_mem[seq_id] = selected_seq_length; + __syncthreads(); + + for (int s = 0; s < seq_id; ++s) { + selected_seq_start += shared_mem[s]; + } + + if (seq_id == 0) { + selected_offsets[0] = 0; + } + } else { + selected_offsets[0] = 0; + } + + WriteBack(selected_ids, selected_scores, selected_offsets, top_beam_local, + seq_offset_start, seq_offset_end, selected_seq_start, + selected_seq_length); + } +} + +template +__global__ void BeamSearchKernel(int64_t* selected_ids, float* selected_scores, + size_t* selected_offsets, + const int64_t* pre_ids, + const float* pre_scores, const int64_t* ids, + const float* scores, const size_t* seq_offsets, + const int num_seqs, const int seq_width, + int beam_size, int end_id, bool is_accumulated, + int num_used_threads) { + const int tid = threadIdx.x; + const int seq_id = (MaxSeqs > 1) ? tid / MaxThreadsPerSeq : tid; + + int seq_offset_start = static_cast(seq_offsets[seq_id]); + int seq_offset_end = static_cast(seq_offsets[seq_id + 1]); + + BeamSearchDetails( + selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, + scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, + is_accumulated, num_used_threads); +} + +template +__global__ void BeamSearchKernelSingle( + int64_t* selected_ids, float* selected_scores, size_t* selected_offsets, + const int64_t* pre_ids, const float* pre_scores, const int64_t* ids, + const float* scores, const int seq_length, const int seq_width, + int beam_size, int end_id, bool is_accumulated, int num_used_threads) { + const int seq_offset_start = 0; + const int seq_offset_end = seq_length; + + BeamSearchDetails( + selected_ids, selected_scores, selected_offsets, pre_ids, pre_scores, ids, + scores, seq_offset_start, seq_offset_end, seq_width, beam_size, end_id, + is_accumulated, num_used_threads); +} + +static inline int GetNumUsedThreads(const int max_threads_per_seq, + const int seq_width, int beam_size) { + int num_used_threads = (seq_width + beam_size - 1) / beam_size; + num_used_threads = max_threads_per_seq < num_used_threads + ? max_threads_per_seq + : num_used_threads; + + num_used_threads = + num_used_threads > 32 + ? (num_used_threads >> 5) << 5 + : (num_used_threads > 16 + ? 32 + : (num_used_threads > 8 + ? 16 + : (num_used_threads > 4 + ? 8 + : (num_used_threads > 2 ? 4 + : num_used_threads)))); + return num_used_threads; +} + +template +class BeamSearchFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, + const framework::LoDTensor* ids, + const framework::LoDTensor* scores, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, size_t level, + size_t beam_size, int end_id, bool is_accumulated) { + auto abs_lod = framework::ToAbsOffset(scores->lod()); + + const int64_t* pre_ids_data = pre_ids->data(); + const float* pre_scores_data = pre_scores->data(); + const int64_t* ids_data = ids ? 
ids->data() : nullptr; + const float* scores_data = scores->data(); + + const size_t num_seqs = abs_lod[level].size() - 1; + size_t seq_width = 1; + for (int i = 1; i < scores->dims().size(); i++) { + seq_width *= scores->dims()[i]; + } + + // Reserve a big enough memory. + auto selected_dims = + framework::make_ddim({static_cast(num_seqs * beam_size), 1}); + int64_t* selected_ids_data = + selected_ids->mutable_data(selected_dims, context.GetPlace()); + float* selected_scores_data = + selected_scores->mutable_data(selected_dims, context.GetPlace()); + + framework::LoD selected_lod(2); + selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); + selected_lod[1].resize(scores->dims()[0] + 1); + size_t* selected_offsets = + selected_lod[1].CUDAMutableData(context.GetPlace()); + + if (num_seqs == 1) { + const int seq_length = static_cast(abs_lod[level][1]); + const int kMaxThreadsPerSeq = 1024; + int num_used_threads = + GetNumUsedThreads(kMaxThreadsPerSeq, static_cast(seq_width), + static_cast(beam_size)); + switch (platform::RoundToPowerOfTwo(beam_size * seq_width)) { + CUDA_LAUNCH_KERNEL_HELPER( + BeamSearchKernelSingle<<< + 1, kMaxThreadsPerSeq, 0, context.stream()>>>( + selected_ids_data, selected_scores_data, selected_offsets, + pre_ids_data, pre_scores_data, ids_data, scores_data, + seq_length, static_cast(seq_width), + static_cast(beam_size), static_cast(end_id), + is_accumulated, num_used_threads)); + } + } else if (num_seqs <= 4) { + const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace()); + // Use only 1 block + const int kMaxThreadsPerSeq = 32; + const int kMaxSeqs = 4; + int num_used_threads = + GetNumUsedThreads(kMaxThreadsPerSeq, static_cast(seq_width), + static_cast(beam_size)); + switch (platform::RoundToPowerOfTwo(beam_size * num_seqs * 32)) { + CUDA_LAUNCH_KERNEL_HELPER( + BeamSearchKernel<<< + 1, num_seqs * kMaxThreadsPerSeq, 0, context.stream()>>>( + selected_ids_data, selected_scores_data, selected_offsets, + pre_ids_data, pre_scores_data, ids_data, scores_data, + seq_offsets, static_cast(num_seqs), + static_cast(seq_width), static_cast(beam_size), + end_id, is_accumulated, num_used_threads)); + } + } else { + LOG(FATAL) << "Not implemented."; + } + + context.Wait(); + if (!framework::CheckLoD(selected_lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod)); + } + + selected_ids->set_lod(selected_lod); + selected_scores->set_lod(selected_lod); + if (selected_lod[1].back() < num_seqs * beam_size) { + auto final_selected_dims = framework::make_ddim( + {static_cast(selected_lod[1].back()), 1}); + selected_ids->Resize(final_selected_dims); + selected_scores->Resize(final_selected_dims); + } + } +}; + +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.h b/paddle/fluid/operators/math/beam_search.h new file mode 100644 index 0000000000..3cd17f426c --- /dev/null +++ b/paddle/fluid/operators/math/beam_search.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * This is an implementation of beam search. + * + * To explain the details, let's take the machine translation task as an + * example. In this task, one source sentence is translated to multiple target + * sentences; during this process, one sentence is expanded into multiple + * translation prefixes (target sentences that have not ended). In each time + * step a prefix has some candidates; given the candidate ids and their + * corresponding scores (probabilities), beam search sorts and selects the top + * beam_size candidates for each source sentence, and stores the selected + * candidates' scores and their corresponding ids in LoDTensors. + * + * A detailed example: + * + * Input + * + * ids: + * - LoD (should have 2 levels) + * - first level: [0, 1, 4] + * - second level: [0, 1, 2, 3, 4] + * - tensor's data: + * [[4, 2, 5] + * [2, 1, 3] + * [3, 5, 2] + * [8, 2, 1]] + * + * scores: + * - LoD same as `ids` + * - tensor's data + * [[0.5, 0.3, 0.2] + * [0.6, 0.3, 0.1] + * [0.9, 0.5, 0.1] + * [0.7, 0.5, 0.1]] + * + * The inputs mean that there are 2 source sentences to translate; the first + * source has 1 prefix and the second source has 3 prefixes. + * + * Let's assume the beam size is 2; then the beam search's output should be + * - LoD + * - first level: [0, 1, 2] + * - second level: [0, 2, 4] + * - id tensor's data + * [[4, + * 1, + * 3, + * 8]] + * - score tensor's data + * [[0.5, + * 0.3, + * 0.9, + * 0.7]] + * + * TODO: all the prune operations should be in the beam search, so it is better + * to split the beam search algorithm into a sequence of smaller operators, and + * the prune operators can be inserted in this sequence. + */ +template +class BeamSearchFunctor { + public: + /* + * The main function of beam search. + * + * @selected_ids: a [None, 1]-shaped tensor with LoD. + * In a machine translation model, it might be the candidate term id sets, + * each set stored as a variable-length sequence. + * The format might be described with a two-level LoD + * - [[0 1], + * [0 1 2]] + * - [[] + * [0 1]] + * the first level of the LoD tells us that there are two source sentences. + * The second level describes the details of the candidate id set's offsets + * in the source sentences. + * + * @selected_scores: a LoD tensor with the same shape and LoD as + * selected_ids. + * It stores the corresponding scores of candidate ids in selected_ids. + * + * Return false if all the input tensors are empty; in a machine translation + * task that means no candidate is provided, and the task will stop running.
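+ *
+ * A minimal usage sketch, mirroring beam_search_test.cc below (the template
+ * arguments are an assumption here, since they are elided elsewhere in this
+ * patch):
+ *
+ *   paddle::operators::math::BeamSearchFunctor<
+ *       paddle::platform::CPUDeviceContext, float> beam_search;
+ *   beam_search(cpu_ctx, &pre_ids, &pre_scores, &ids, &scores,
+ *               &selected_ids, &selected_scores, level, beam_size,
+ *               end_id, true);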
+ */ + void operator()(const DeviceContext& context, + const framework::LoDTensor* pre_ids, + const framework::LoDTensor* pre_scores, + const framework::LoDTensor* ids, + const framework::LoDTensor* scores, + framework::LoDTensor* selected_ids, + framework::LoDTensor* selected_scores, size_t level, + size_t beam_size, int end_id, bool is_accumulated); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc new file mode 100644 index 0000000000..1c29ee95f6 --- /dev/null +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/beam_search.h" +#include +#include + +void PrepareCPUTensors(paddle::framework::LoDTensor* ids, + paddle::framework::LoDTensor* scores, + paddle::framework::LoDTensor* pre_ids, + paddle::framework::LoDTensor* pre_scores) { + // lod + paddle::framework::LoD lod; + std::vector level0({0, 2, 4}); + std::vector level1({0, 1, 2, 3, 4}); + lod.push_back(level0); + lod.push_back(level1); + ids->set_lod(lod); + scores->set_lod(lod); + + auto dims = paddle::framework::make_ddim({4, 3}); + ids->Resize(dims); + scores->Resize(dims); + + paddle::platform::CPUPlace place; + auto* ids_data = ids->mutable_data(place); + auto* scores_data = scores->mutable_data(place); + std::vector ids_vec_data({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); + std::vector scores_vec_data( + {0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); + + CHECK_EQ(static_cast(ids->numel()), ids_vec_data.size()); + CHECK_EQ(static_cast(ids->numel()), scores_vec_data.size()); + + for (int i = 0; i < ids->numel(); i++) { + ids_data[i] = ids_vec_data[i]; + scores_data[i] = scores_vec_data[i]; + } + + // pre_ids + pre_ids->Resize(paddle::framework::make_ddim({4, 1})); + for (int i = 0; i < 4; i++) { + pre_ids->mutable_data(place)[i] = i + 1; + } + + // pre_scores + pre_scores->Resize(paddle::framework::make_ddim({4, 1})); + for (int i = 0; i < 4; i++) { + pre_scores->mutable_data(place)[i] = 0.1 * (i + 1); + } +} + +template +void TestBeamSearch() { + paddle::framework::LoDTensor ids; + paddle::framework::LoDTensor scores; + paddle::framework::LoDTensor pre_ids; + paddle::framework::LoDTensor pre_scores; + + auto* place = new Place(); + DeviceContext* context = new DeviceContext(*place); + if (paddle::platform::is_cpu_place(*place)) { + PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores); + } else { + paddle::framework::LoDTensor cpu_ids; + paddle::framework::LoDTensor cpu_scores; + paddle::framework::LoDTensor cpu_pre_ids; + paddle::framework::LoDTensor cpu_pre_scores; + + PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); + + TensorCopySync(cpu_ids, *place, &ids); + TensorCopySync(cpu_scores, *place, &scores); + TensorCopySync(cpu_pre_ids, *place, &pre_ids); + TensorCopySync(cpu_pre_scores, *place, 
&pre_scores); + + ids.set_lod(cpu_ids.lod()); + scores.set_lod(cpu_scores.lod()); + pre_ids.set_lod(cpu_pre_ids.lod()); + pre_scores.set_lod(cpu_pre_scores.lod()); + } + + paddle::framework::LoDTensor selected_ids; + paddle::framework::LoDTensor selected_scores; + + size_t level = 0; + size_t beam_size = 2; + int end_id = 0; + paddle::operators::math::BeamSearchFunctor beamsearch; + beamsearch(*context, &pre_ids, &pre_scores, &ids, &scores, &selected_ids, + &selected_scores, level, beam_size, end_id, true); + + ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); + + paddle::framework::LoDTensor cpu_selected_ids; + paddle::framework::LoDTensor cpu_selected_scores; + if (paddle::platform::is_cpu_place(*place)) { + cpu_selected_ids = selected_ids; + cpu_selected_scores = selected_scores; + } else { + TensorCopySync(selected_ids, paddle::platform::CPUPlace(), + &cpu_selected_ids); + TensorCopySync(selected_scores, paddle::platform::CPUPlace(), + &cpu_selected_scores); + cpu_selected_ids.set_lod(selected_ids.lod()); + cpu_selected_scores.set_lod(selected_scores.lod()); + } + + std::vector expected_ids({4, 5, 3, 8}); + std::vector expected_scores({0.6f, 0.5f, 0.9f, 0.7f}); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(expected_ids[i], cpu_selected_ids.data()[i]); + ASSERT_EQ(expected_scores[i], cpu_selected_scores.data()[i]); + } + + delete place; + delete context; +} + +TEST(BeamSearch, CPU) { + TestBeamSearch(); +} + +#ifdef PADDLE_WITH_CUDA +TEST(BeamSearch, GPU) { + TestBeamSearch(); +} +#endif diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc index 2708f3bcd8..238d9f2905 100644 --- a/paddle/fluid/operators/math/sampler.cc +++ b/paddle/fluid/operators/math/sampler.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/sampler.h" +#include #include #include #include @@ -77,7 +78,14 @@ int64_t CustomSampler::Sample() const { auto index = (*int_dist_)(*random_engine_); auto p = (*real_dist_)(*random_engine_); if (p > alias_probs_[index]) { - return alias_[index]; + int alias = alias_[index]; + + if (alias == exceptional_val) { + LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val; + return index; + } + + return alias; } else { return index; } diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 98e0b898a5..3fa5a7ae33 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -116,6 +116,7 @@ class CustomSampler : public Sampler { const float* alias_probs_; const int* alias_; const float* probs_; + const int exceptional_val = -1; std::shared_ptr random_engine_; std::shared_ptr> real_dist_; std::shared_ptr> int_dist_; diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index f15b37a1e3..aedb82da2f 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -354,7 +354,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { auto* out_data = output->value().data(); for (size_t i = 0; i < ret_rows.size(); ++i) { - for (size_t j = 0; j < row_numel; ++j) { + for (size_t j = 0; j < static_cast(row_numel); ++j) { EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); } } diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 73d83fa2e4..74892316e6 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -301,7 +301,7 @@ TEST(selected_rows_functor, gpu_merge_add) { auto* out_data = output_cpu.data(); for (size_t i = 0; i < ret_rows.size(); ++i) { - for (size_t j = 0; j < row_numel; ++j) { + for (size_t j = 0; j < static_cast(row_numel); ++j) { EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); } } diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 5535523e79..cf6e89b3d9 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -66,7 +66,7 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { cpu_in_grad.set_lod(in_grad.lod()); } - EXPECT_EQ(in_grad.numel(), lod[0].back() * second_dim); + EXPECT_EQ(in_grad.numel(), static_cast(lod[0].back() * second_dim)); EXPECT_EQ(in_grad.lod(), lod); if (paddle::platform::is_cpu_place(*place)) { diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 2c97eef096..3e48b67a57 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -119,6 +119,11 @@ class NCEKernel : public framework::OpKernel { PrepareSamples(context, sampler); auto sample_labels = context.Output("SampleLabels"); const int64_t *sample_labels_data = sample_labels->data(); + + for (int x = 0; x < sample_labels->numel(); x++) { + PADDLE_ENFORCE_GE(sample_labels_data[x], 0, "nce sample label %d", x); + } + auto sample_out = context.Output("SampleLogits"); T *sample_out_data = sample_out->mutable_data(context.GetPlace()); auto label = context.Input("Label"); diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt 
diff --git a/paddle/fluid/operators/ngraph/CMakeLists.txt b/paddle/fluid/operators/ngraph/CMakeLists.txt
new file mode 100644
index 0000000000..6b256ef026
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/CMakeLists.txt
@@ -0,0 +1,5 @@
+if(WITH_NGRAPH)
+  cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
+  cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto)
+  op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context)
+endif()
diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
similarity index 55%
rename from paddle/fluid/framework/ngraph_bridge.cc
rename to paddle/fluid/operators/ngraph/ngraph_bridge.cc
index 365870c54e..d6e897ed46 100644
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc
@@ -17,39 +17,39 @@ limitations under the License. */
 
 #include
 #include "ngraph/ngraph.hpp"
 
-#include "paddle/fluid/framework/ngraph_bridge.h"
-#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
 #include "paddle/fluid/operators/ngraph/ngraph_ops.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
-namespace framework {
+namespace operators {
 
 namespace NG_OPS = paddle::operators::ngraphs;
 std::map<std::string,
-         std::function<void(const std::shared_ptr<OperatorBase>&,
+         std::function<void(const std::shared_ptr<framework::OperatorBase>&,
                             std::shared_ptr<std::unordered_map<
                                 std::string, std::shared_ptr<ngraph::Node>>>)>>
     NgraphBridge::NG_NODE_MAP = {
        {"elementwise_add", NG_OPS::BuildElementwiseAddNode},
        {"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
-       {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode},
-       {"mean", paddle::operators::ngraphs::BuildMeanNode},
-       {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode},
-       {"mul", paddle::operators::ngraphs::BuildMulNode},
-       {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
-       {"softmax", paddle::operators::ngraphs::BuildSoftmaxNode},
-       {"softmax_grad", paddle::operators::ngraphs::BuildSoftmaxGradNode},
-       {"scale", paddle::operators::ngraphs::BuildScaleNode},
-       {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
-       {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>},
-       {"top_k", paddle::operators::ngraphs::BuildTopKNode}};
-
-void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
+       {"fill_constant", NG_OPS::BuildFillConstantNode},
+       {"mean", NG_OPS::BuildMeanNode},
+       {"mean_grad", NG_OPS::BuildMeanGradNode},
+       {"mul", NG_OPS::BuildMulNode},
+       {"mul_grad", NG_OPS::BuildMulGradNode},
+       {"softmax", NG_OPS::BuildSoftmaxNode},
+       {"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
+       {"scale", NG_OPS::BuildScaleNode},
+       {"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>},
+       {"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>},
+       {"top_k", NG_OPS::BuildTopKNode}};
+
+void NgraphBridge::BuildNgNode(
+    const std::shared_ptr<framework::OperatorBase>& op) {
   auto& op_type = op->Type();
   NG_NODE_MAP[op_type](op, ngb_node_map_);
 }
 
-}  // namespace framework
+}  // namespace operators
 }  // namespace paddle
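The bridge is a string-keyed builder table: BuildNgNode looks up op->Type() in NG_NODE_MAP and the matching builder emits the corresponding nGraph nodes into the shared var-to-node map. A minimal sketch of that dispatch pattern (simplified types, illustrative op names):

#include <functional>
#include <iostream>
#include <map>
#include <string>

int main() {
  // maps a Paddle op type to a node builder, as NG_NODE_MAP does
  std::map<std::string, std::function<void()>> node_map = {
      {"mul", [] { std::cout << "emit nGraph Dot node\n"; }},
      {"relu", [] { std::cout << "emit nGraph Relu node\n"; }}};
  auto it = node_map.find("relu");
  if (it != node_map.end()) it->second();  // unsupported ops stay in Paddle
  return 0;
}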
*/ #include "ngraph/node.hpp" -namespace paddle { -namespace framework { +#include "paddle/fluid/framework/operator.h" -class OperatorBase; +namespace paddle { +namespace operators { class NgraphBridge { public: static std::map< std::string, - std::function&, + std::function&, std::shared_ptr>>)>> NG_NODE_MAP; @@ -41,7 +41,7 @@ class NgraphBridge { var_node_map) : ngb_node_map_(var_node_map) {} - void BuildNgNode(const std::shared_ptr& op); + void BuildNgNode(const std::shared_ptr& op); private: std::shared_ptr< @@ -49,5 +49,5 @@ class NgraphBridge { ngb_node_map_; }; -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc new file mode 100644 index 0000000000..bec4b514a2 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -0,0 +1,491 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" + +namespace paddle { +namespace operators { + +static ngraph::Shape Ddim2Shape(const framework::DDim& dims) { + ngraph::Shape sp; + for (int i = 0; i < dims.size(); ++i) { + int k = dims[i]; + k = k == 0 ? 
1 : k; + sp.push_back(k); + } + return sp; +} + +static std::map + pd2ng_type_map = { + {framework::proto::VarType::FP32, ngraph::element::f32}, + {framework::proto::VarType::FP64, ngraph::element::f64}, + {framework::proto::VarType::INT32, ngraph::element::i32}, + {framework::proto::VarType::INT64, ngraph::element::i64}, + {framework::proto::VarType::BOOL, ngraph::element::boolean}, +}; + +std::unordered_map> + NgraphEngine::func_cache_ = {}; + +std::shared_ptr NgraphEngine::backend_ = + ngraph::runtime::Backend::create("CPU"); + +static std::vector> NgraphOpIntervals( + framework::BlockDesc* block) { + std::vector> intervals; + auto ops = block->AllOps(); + int size = ops.size(); + int left = 0; + while (left < size && ops.at(left)->Type() != framework::kFeedOpType) { + ++left; + } + if (left == size) { + return intervals; + } + while (left < size && ops.at(left)->Type() == framework::kFeedOpType) { + ++left; + } + + int right = left; + while (right < size && ops.at(right)->Type() != framework::kFetchOpType) { + ++right; + } + if (right == size) { + return intervals; + } + if (left >= right) return intervals; + + // (left, right - 1) represents indices between feed and fetch + int pivot = left; + while (pivot < right) { + auto op_type = ops.at(pivot)->Type(); + if (NgraphBridge::NG_NODE_MAP.find(op_type) == + NgraphBridge::NG_NODE_MAP.end()) { + ++pivot; + } else { + int start = pivot, end = start; + while (pivot < right && + (NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) != + NgraphBridge::NG_NODE_MAP.end())) { + ++pivot; + ++end; + } + std::vector interval = {start, end}; + intervals.push_back(interval); + } + } // end while + return intervals; +} + +static void SubstituteNgraphOp(framework::BlockDesc* block, + std::string block_str, + std::vector interval) { + framework::ProgramDesc program; + block->RemoveOp(interval.at(0), interval.at(1)); + auto* ng_op = block->InsertOp(interval.at(0)); + ng_op->SetType("ngraph_engine"); + ng_op->SetAttr("interval", interval); + ng_op->SetAttr("graph", block_str); +} + +// TODO(baojun-nervana): Move EnableNgraph to compile time per PR #15089 +void NgraphEngine::EnableNgraph(const framework::ProgramDesc& program) { +#ifdef PADDLE_WITH_NGRAPH + VLOG(4) << "use_ngraph=True"; + for (size_t bid = 0; bid < program.Size(); ++bid) { + // TODO(baojun-nervana): Remove the const_cast + auto* block = + const_cast(program).MutableBlock(bid); + std::string block_str = block->Proto()->SerializeAsString(); + auto intervals = NgraphOpIntervals(block); + for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { + SubstituteNgraphOp(block, block_str, *it); + } + } +#else + LOG(WARNING) + << "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option"; +#endif +} + +NgraphEngine::NgraphEngine(const framework::Scope& scope, + const platform::Place& place, + const std::string& serialized_graph, + const std::vector& interval) + : scope_(scope), place_(place) { + var_in_node_map_ = std::make_shared< + std::unordered_map>>(); + + var_node_map_ = std::make_shared< + std::unordered_map>>(); + + func_cache_key_ = std::to_string(interval[0]) + std::to_string(interval[1]) + + serialized_graph; + + framework::proto::BlockDesc bdesc; + bdesc.ParseFromString(serialized_graph); + framework::BlockDesc block(nullptr, &bdesc); + + Prepare(block, interval); + + BuildNgIO(); + + GetNgFunction(); +} + +void NgraphEngine::Prepare(const framework::BlockDesc& block, + const std::vector& interval) { + for (auto& var : block.AllVars()) { + if (!(var->GetType() == 
framework::proto::VarType::SELECTED_ROWS || + var->GetType() == framework::proto::VarType::LOD_TENSOR || + var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) { + continue; + } + + auto var_name = var->Name(); + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var_name != framework::kFeedOpType && + var_name != framework::kFetchOpType) { + auto pd_type = var->GetDataType(); + if (pd2ng_type_map.find(pd_type) == pd2ng_type_map.end()) { + PADDLE_THROW("Data type of var %s not found in pd2ng_type_map", + var_name); + } + var_type_map_[var_name] = pd2ng_type_map[pd_type]; + } + + if (var->Persistable()) { + persistables_.insert(var->Name()); + } + } + + auto ops_desc = block.AllOps(); + int idx = interval[0]; + while (idx < interval[1]) { + auto op_desc = ops_desc.at(idx); + auto op = framework::OpRegistry::CreateOp(*op_desc); + fused_ops_.push_back(std::move(op)); + ++idx; + } + + while (ops_desc.at(idx)->Type() != framework::kFetchOpType) { + auto op_desc = ops_desc.at(idx); + for (auto& var_name_item : op_desc->Inputs()) { + for (auto& var_name : var_name_item.second) { + post_op_inputs_.insert(var_name); + } + } + ++idx; + } + + while (idx < static_cast(ops_desc.size()) && + ops_desc.at(idx)->Type() == framework::kFetchOpType) { + std::string fetch_target_name = ops_desc.at(idx)->Input("X")[0]; + fetches_.insert(fetch_target_name); + ++idx; + } + + if (ops_desc.at(interval.at(0) - 1)->Type() == framework::kFeedOpType && + ops_desc.at(interval.at(1))->Type() == framework::kFetchOpType) { + ng_op_state_ = OpState::FULL; + } + + for (auto* op_desc : ops_desc) { + if (op_desc->Type().find("_grad") != std::string::npos) { + ng_op_state_ = ng_op_state_ == OpState::FULL ? OpState::FULL_TRAIN + : OpState::PARTIAL_TRAIN; + break; + } + } + + if (ng_op_state_ != OpState::FULL_TRAIN && + ng_op_state_ != OpState::PARTIAL_TRAIN) { + ng_op_state_ = ng_op_state_ == OpState::FULL ? 
OpState::FULL_TEST + : OpState::PARTIAL_TEST; + } +} + +void NgraphEngine::GetNgInputShape( + std::shared_ptr op) { + framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); + op->RuntimeInferShape(scope_, place_, ctx); + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto sp = Ddim2Shape(tensor_pd->dims()); + if (std::find(var_in_.begin(), var_in_.end(), var_name) != + var_in_.end()) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + // auto ng_type = pd2ng_type_map.at(GetDataTypeOfVar(var)); + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, sp, true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } + } + } + } + } +} + +void NgraphEngine::BuildNgNodes() { + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + for (auto& var_name : var_name_item.second) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + auto ng_shape = Ddim2Shape(ddim); + auto ng_type = var_type_map_.at(var_name); + auto prm = std::make_shared(ng_type, + ng_shape, true); + (*var_node_map_)[var_name] = prm; + } + } + } + } + } + NgraphBridge ngb(var_node_map_); + for (auto& op : fused_ops_) { + ngb.BuildNgNode(op); + } +} + +void NgraphEngine::BuildNgIO() { + std::unordered_set inputs; + std::unordered_set outputs; + + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + inputs.insert(var_name); + const bool is_output = outputs.find(var_name) != outputs.end(); + if (!is_output && + std::find(var_in_.begin(), var_in_.end(), var_name) == + var_in_.end()) { + // fill var_in here to keep lhs and rhs order + var_in_.push_back(var_name); + } + } + } + + if (op->Type() != "fill_constant") { + GetNgInputShape(op); + } + + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + outputs.insert(var_name); + } + } + } + + // var_out.clear(); + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + switch (ng_op_state_) { + case OpState::PARTIAL_TEST: + if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || + fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::FULL_TEST: + if (fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::PARTIAL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + post_op_inputs_.find(var_name) != post_op_inputs_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + case OpState::FULL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + default: + var_out_.push_back(var_name); + } + } + } + } +} + +void NgraphEngine::BuildNgFunction() 
{ + BuildNgNodes(); + ngraph_function_ = nullptr; + ngraph::NodeVector func_outputs; + ngraph::ParameterVector func_inputs; + + for (auto& vo : var_out_) { + func_outputs.push_back(var_node_map_->at(vo)); + } + + for (auto& vi : var_in_) { + std::shared_ptr prm = + std::dynamic_pointer_cast( + var_in_node_map_->at(vi)); + func_inputs.push_back(prm); + } + + ngraph_function_ = + std::make_shared(func_outputs, func_inputs); +} + +void NgraphEngine::GetNgFunction() { + bool cache_on = true; + if (cache_on) { + std::string input_shape_str; + for (auto& var_name : var_in_) { + auto shape = var_node_map_->at(var_name)->get_shape(); + for (size_t i = 0; i < shape.size(); ++i) { + input_shape_str += std::to_string(shape.at(i)); + } + } + func_cache_key_ = input_shape_str + func_cache_key_; + if (func_cache_.find(func_cache_key_) != func_cache_.end()) { + ngraph_function_ = func_cache_.at(func_cache_key_); + } else { + BuildNgFunction(); + func_cache_[func_cache_key_] = ngraph_function_; + } + } else { + BuildNgFunction(); + } +} + +void NgraphEngine::Run(const framework::Scope& scope, + const platform::Place& place) const { + std::vector> t_in; + std::vector> t_out; + + for (size_t i = 0; i < var_in_.size(); ++i) { + auto vi = var_in_.at(i); + auto sp = var_node_map_->at(vi)->get_shape(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), + "Ensure ngraph tensor layout align with paddle tensor"); + auto ng_type = var_type_map_.at(vi); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); + } else if (ng_type == ngraph::element::i32) { + const int* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i32, sp, + const_cast(arr)); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + ti = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handling for var %s", vi); + } + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + } + bool is_test = (ng_op_state_ == OpState::PARTIAL_TEST || + ng_op_state_ == OpState::FULL_TEST) + ? true + : false; + bool is_persistable = + (persistables_.find(vi) != persistables_.end()) ? 
true : false; + if (is_test && is_persistable) { + ti->set_stale(false); + } + t_in.push_back(ti); + } + + for (size_t i = 0; i < var_out_.size(); ++i) { + auto vo = var_out_[i]; + auto* var = scope.FindVar(vo); + std::shared_ptr to; + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + auto dd = tensor_pd->dims(); + ngraph::Shape sp = Ddim2Shape(dd); + auto ng_type = var_type_map_.at(vo); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::i32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ng_type, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handled in for var %s", vo); + } + t_out.push_back(to); + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vo); + } + } + + backend_->call(backend_->compile(ngraph_function_), t_out, t_in); +} // NgraphEngine::Run +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h new file mode 100644 index 0000000000..bf5ff2a743 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +#include "ngraph/ngraph.hpp" + +namespace paddle { +namespace operators { + +enum class OpState { /* nGraph support state on ops */ + FULL_TRAIN, /* Support full ops for train */ + PARTIAL_TRAIN, /* Support partial ops for train */ + FULL_TEST, /* Support full list of ops for test */ + PARTIAL_TEST, /* Support partial list of ops for test */ + FULL, /* All ops supported from feed to fetch */ + UNKNOWN /* Output all for debug purpose */ +}; + +// perform graph build through bridge and execute computation +class NgraphEngine { + public: + explicit NgraphEngine(const framework::Scope& scope, + const platform::Place& place, + const std::string& serialized_graph, + const std::vector& interval); + + void Run(const framework::Scope& scope, const platform::Place& place) const; + + static void EnableNgraph(const framework::ProgramDesc& program); + + private: + static std::unordered_map> + func_cache_; + const framework::Scope& scope_; + const platform::Place& place_; + std::vector> fused_ops_; + std::unordered_map var_type_map_; + std::unordered_set persistables_; + std::unordered_set fetches_; + std::unordered_set post_op_inputs_; + OpState ng_op_state_ = OpState::UNKNOWN; + std::string func_cache_key_; + + // ngraph backend eg. CPU + static std::shared_ptr backend_; + // ngraph function to call and execute + std::shared_ptr ngraph_function_; + // var_name of inputs + std::vector var_in_; + // var_name of outputs from fetch in order + std::vector var_out_; + // map input vars to nodes + std::shared_ptr< + std::unordered_map>> + var_in_node_map_; + // map each var name with a ngraph node + std::shared_ptr< + std::unordered_map>> + var_node_map_; + // prepare info for nraph engine + void Prepare(const framework::BlockDesc& block, + const std::vector& interval); + // get ngraph input and define ngraph input parameters + void GetNgInputShape(std::shared_ptr op); + // Call ngraph bridge to map ops + void BuildNgNodes(); + // get the ngraph input and output var list + void BuildNgIO(); + // build ngraph function call + void BuildNgFunction(); + // Check cache for ngraph function or otherwise build the function + void GetNgFunction(); +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc new file mode 100644 index 0000000000..3051ca123b --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine_op.h" + +namespace paddle { +namespace operators { + +class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Xs", "A list of inputs.").AsDispensable(); + AddOutput("Ys", "A list of outputs").AsDispensable(); + AddAttr("graph", "the graph."); + AddAttr>("interval", "op interval supported by ngraph"); + AddComment("ngraph engine operator."); + } +}; + +class NgraphEngineInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(ngraph_engine, ops::NgraphEngineOp, ops::NgraphEngineOpMaker, + ops::NgraphEngineOpMaker); +REGISTER_OP_CPU_KERNEL( + ngraph_engine, + ops::NgraphEngineKernel); diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.h b/paddle/fluid/operators/ngraph/ngraph_engine_op.h new file mode 100644 index 0000000000..d2974298b0 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +class NgraphEngineOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::OpKernelType kt = framework::OpKernelType( + framework::proto::VarType::FP32, ctx.GetPlace()); + return kt; + } +}; + +template +class NgraphEngineKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& scope = ctx.scope(); + auto place = ctx.GetPlace(); + std::string serialized_graph = ctx.Attr("graph"); + auto interval = ctx.Attr>("interval"); + + NgraphEngine ngraph_engine(scope, place, serialized_graph, interval); + ngraph_engine.Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc index 58a465d87a..2a3e80c915 100644 --- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc +++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc @@ -41,13 +41,19 @@ class CreateCTRReaderOp : public framework::OperatorBase { auto* queue_holder = queue_holder_var->template GetMutable(); - int thread_num = Attr("thread_num"); - std::vector slots = Attr>("slots"); - int batch_size = Attr("batch_size"); - std::vector file_list = - Attr>("file_list"); - out->Reset(std::make_shared(queue_holder->GetQueue(), batch_size, - thread_num, slots, file_list)); + auto thread_num = Attr("thread_num"); + auto sparse_slots = Attr>("sparse_slots"); + auto dense_slot_index = Attr>("dense_slot_index"); + auto sparse_slot_index = Attr>("sparse_slot_index"); + auto batch_size = Attr("batch_size"); + auto file_type = Attr("file_type"); + auto file_format = Attr("file_format"); + auto file_list = Attr>("file_list"); + DataDesc data_desc(batch_size, file_list, file_type, file_format, + dense_slot_index, sparse_slot_index, sparse_slots); + VLOG(1) << data_desc; + out->Reset(std::make_shared(queue_holder->GetQueue(), thread_num, + data_desc)); } }; @@ -58,10 +64,22 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase { "Name of the `LoDTensorBlockingQueueHolder` variable"); AddAttr("thread_num", "the thread num to read data"); AddAttr("batch_size", "the batch size of read data"); + AddAttr("file_type", "plain or gzip").SetDefault("plain"); + AddAttr("file_format", "svm or csv").SetDefault("csv"); AddAttr>("file_list", "The list of files that need to read"); - AddAttr>( - "slots", "the slots that should be extract from file"); + AddAttr>( + "dense_slot_index", + "the dense slots id that should be extract from file") + .SetDefault({}); + AddAttr>( + "sparse_slot_index", + "the sparse slots id that should be extract from file") + .SetDefault({}); + AddAttr>("sparse_slots", + "the sparse slots id that should be " + "extract from file, used when file " + "format is svm"); AddComment(R"DOC( Create CTRReader to support read ctr data with cpp. 
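The reader attributes above are consumed as a single DataDesc rather than loose arguments. A sketch of configuring the new CSV path (values mirror the read_csv_data unit test later in this patch; the file name is a hypothetical placeholder):

#include <string>
#include <vector>

#include "paddle/fluid/operators/reader/ctr_reader.h"

int main() {
  std::vector<std::string> file_list = {"ctr_data_part0.csv"};  // hypothetical file
  // batch_size=3; column 1 holds dense features, column 2 sparse ids
  paddle::operators::reader::DataDesc data_desc(
      /*batch_size=*/3, file_list, /*file_type=*/"plain",
      /*file_format=*/"csv", /*dense_slot_index=*/{1},
      /*sparse_slot_index=*/{2}, /*sparse_slot_ids=*/{});
  return 0;
}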
diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index d1d3ddc89d..f08798794a 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -73,6 +73,9 @@ static inline void parse_line( } } +// label slot1:fea_sign slot2:fea_sign slot1:fea_sign +static inline void parse_svm_line(const std::string& line) {} + class Reader { public: virtual ~Reader() {} @@ -95,11 +98,27 @@ class GzipReader : public Reader { igzstream gzstream_; }; -class MultiGzipReader : public Reader { +class PlainFileReader : public Reader { public: - explicit MultiGzipReader(const std::vector& file_list) { + explicit PlainFileReader(const std::string& file_name) + : stream_(file_name.c_str()) {} + + ~PlainFileReader() {} + + bool HasNext() override { return stream_.peek() != EOF; } + + void NextLine(std::string* line) override { std::getline(stream_, *line); } + + private: + std::ifstream stream_; +}; + +template +class MultiFileReader : public Reader { + public: + explicit MultiFileReader(const std::vector& file_list) { for (auto& file : file_list) { - readers_.emplace_back(std::make_shared(file)); + readers_.emplace_back(std::make_shared(file)); } } @@ -119,46 +138,35 @@ class MultiGzipReader : public Reader { } private: - std::vector> readers_; + std::vector> readers_; size_t current_reader_index_ = 0; }; void MonitorThread(std::vector* thread_status, std::shared_ptr queue) { - VLOG(30) << "monitor thread in"; + VLOG(3) << "monitor thread in"; bool reader_thread_is_running = true; while (reader_thread_is_running) { - VLOG(30) << "reader_thread_is_running"; + VLOG(3) << "reader_thread_is_running"; reader_thread_is_running = false; for (size_t i = 0; i < (*thread_status).size(); ++i) { if ((*thread_status)[i] == Running) { - VLOG(30) << "reader is running!"; + VLOG(3) << "reader is running!"; reader_thread_is_running = true; } } std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } - VLOG(30) << "all reader thread is stopped, push empty data into queue"; - queue->Push({}); - VLOG(30) << "monitor thread exited"; + VLOG(3) << "all reader thread is stopped, close the queue"; + queue->Close(); + VLOG(3) << "monitor thread exited"; } -void ReadThread(const std::vector& file_list, - const std::vector& slots, int batch_size, - int thread_id, std::vector* thread_status, - std::shared_ptr queue) { - VLOG(30) << "[" << thread_id << "]" - << " reader thread start! 
thread_id = " << thread_id; - for (auto& file : file_list) { - VLOG(30) << "[" << thread_id << "]" - << " file " << file; - } - (*thread_status)[thread_id] = Running; - VLOG(30) << "set status to running"; - +void ReadSvmData(const DataDesc& data_desc, std::shared_ptr reader, + std::shared_ptr queue) { std::unordered_map slot_to_index; - for (size_t i = 0; i < slots.size(); ++i) { - slot_to_index[slots[i]] = i; + for (size_t i = 0; i < data_desc.sparse_slot_ids_.size(); ++i) { + slot_to_index[data_desc.sparse_slot_ids_[i]] = i; } std::string line; @@ -166,21 +174,17 @@ void ReadThread(const std::vector& file_list, std::vector>> batch_data; std::vector batch_label; - MultiGzipReader reader(file_list); - - VLOG(30) << "reader inited"; - - while (reader.HasNext()) { + while (reader->HasNext()) { batch_data.clear(); - batch_data.reserve(batch_size); + batch_data.reserve(data_desc.batch_size_); batch_label.clear(); - batch_label.reserve(batch_size); + batch_label.reserve(data_desc.batch_size_); // read batch_size data - for (int i = 0; i < batch_size; ++i) { - if (reader.HasNext()) { - reader.NextLine(&line); + for (int i = 0; i < data_desc.batch_size_; ++i) { + if (reader->HasNext()) { + reader->NextLine(&line); std::unordered_map> slot_to_data; int64_t label; parse_line(line, slot_to_index, &label, &slot_to_data); @@ -193,8 +197,8 @@ void ReadThread(const std::vector& file_list, std::vector lod_datas; - // first insert tensor for each slots - for (auto& slot : slots) { + // first insert tensor for each sparse_slots + for (auto& slot : data_desc.sparse_slot_ids_) { std::vector lod_data{0}; std::vector batch_feasign; @@ -226,11 +230,167 @@ void ReadThread(const std::vector& file_list, lod_datas.push_back(label_tensor); queue->Push(lod_datas); - VLOG(40) << "push one data, queue_size=" << queue->Size(); + VLOG(4) << "push one data, queue_size=" << queue->Size(); + } +} + +// label dense_fea,dense_fea sparse_fea,sparse_fea +static inline void parse_csv_line( + const std::string& line, const DataDesc& data_desc, int64_t* label, + std::vector>* dense_datas, + std::vector>* sparse_datas) { + std::vector ret; + string_split(line, ' ', &ret); + *label = std::stol(ret[0]); + dense_datas->resize(data_desc.dense_slot_index_.size()); + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + int slot_idx = data_desc.dense_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(slot_data, ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + (*dense_datas)[i].push_back(std::stof(data_str)); + } + } + sparse_datas->resize(data_desc.sparse_slot_index_.size()); + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) { + int slot_idx = data_desc.sparse_slot_index_[i]; + auto& slot_data = ret[slot_idx]; + std::vector data_in_slot_str; + string_split(slot_data, ',', &data_in_slot_str); + std::vector data_in_slot; + for (auto& data_str : data_in_slot_str) { + auto id = std::stol(data_str); + (*sparse_datas)[i].push_back(id); + } + } +} + +void ReadCsvData(const DataDesc& data_desc, std::shared_ptr reader, + std::shared_ptr queue) { + std::string line; + while (reader->HasNext()) { + std::vector batch_label; + batch_label.reserve(data_desc.batch_size_); + + std::vector>> batch_dense_data; + batch_dense_data.reserve(data_desc.batch_size_); + + std::vector>> batch_sparse_data; + batch_sparse_data.reserve(data_desc.batch_size_); + + // read batch_size data + for (int i = 0; i < data_desc.batch_size_; ++i) { + 
if (reader->HasNext()) { + reader->NextLine(&line); + int64_t label; + std::vector> dense_datas; + std::vector> sparse_datas; + parse_csv_line(line, data_desc, &label, &dense_datas, &sparse_datas); + batch_label.push_back(label); + if (!batch_dense_data.empty()) { + PADDLE_ENFORCE_EQ(batch_dense_data[0].size(), dense_datas.size(), + "dense data should have the same shape"); + } + batch_dense_data.push_back(dense_datas); + batch_sparse_data.push_back(sparse_datas); + } else { + break; + } + } + + // the order of output data is label, dense_datas, sparse_datas + std::vector lod_datas; + + // insert label tensor + framework::LoDTensor label_tensor; + auto* label_tensor_data = label_tensor.mutable_data( + framework::make_ddim({static_cast(batch_label.size()), 1}), + platform::CPUPlace()); + memcpy(label_tensor_data, batch_label.data(), + batch_label.size() * sizeof(int64_t)); + lod_datas.push_back(label_tensor); + + // insert tensor for each dense_slots + for (size_t i = 0; i < data_desc.dense_slot_index_.size(); ++i) { + framework::LoDTensor lod_tensor; + size_t width = batch_dense_data[0][i].size(); + auto* tensor_data = lod_tensor.mutable_data( + framework::make_ddim( + {static_cast(batch_dense_data.size()), // batch_size + static_cast(width)}), + platform::CPUPlace()); + + for (size_t j = 0; j < batch_dense_data.size(); ++j) { + auto& dense_data_row = batch_dense_data[j][i]; + memcpy(tensor_data + j * width, dense_data_row.data(), + width * sizeof(float)); + } + + lod_datas.push_back(lod_tensor); + } + + // insert tensor for each sparse_slots + for (size_t i = 0; i < data_desc.sparse_slot_index_.size(); ++i) { + std::vector lod_data{0}; + std::vector batch_feasign; + + for (size_t row_idx = 0; row_idx < batch_sparse_data.size(); ++row_idx) { + auto& sparse_ids = batch_sparse_data[row_idx][i]; + lod_data.push_back(lod_data.back() + sparse_ids.size()); + batch_feasign.insert(batch_feasign.end(), sparse_ids.begin(), + sparse_ids.end()); + } + + framework::LoDTensor lod_tensor; + framework::LoD lod{lod_data}; + lod_tensor.set_lod(lod); + int64_t* tensor_data = lod_tensor.mutable_data( + framework::make_ddim({static_cast(batch_feasign.size()), 1}), + platform::CPUPlace()); + memcpy(tensor_data, batch_feasign.data(), + batch_feasign.size() * sizeof(int64_t)); + lod_datas.push_back(lod_tensor); + } + + queue->Push(lod_datas); + VLOG(4) << "push one data, queue_size=" << queue->Size(); + } +} + +void ReadThread(const std::vector& file_list, + const DataDesc& data_desc, int thread_id, + std::vector* thread_status, + std::shared_ptr queue) { + VLOG(3) << "[" << thread_id << "]" + << " reader thread start! 
thread_id = " << thread_id; + for (auto& file : file_list) { + VLOG(3) << "[" << thread_id << "]" + << " file " << file; + } + (*thread_status)[thread_id] = Running; + VLOG(3) << "set status to running"; + + std::shared_ptr reader; + if (data_desc.file_type_ == "gzip") { + reader.reset(new MultiFileReader(file_list)); + } else if (data_desc.file_type_ == "plain") { + reader.reset(new MultiFileReader(file_list)); + } else { + PADDLE_THROW("do not support file format %s", data_desc.file_type_); + } + + VLOG(3) << "reader inited"; + + if (data_desc.file_format_ == "svm") { + ReadSvmData(data_desc, reader, queue); + } else if (data_desc.file_format_ == "csv") { + ReadCsvData(data_desc, reader, queue); } (*thread_status)[thread_id] = Stopped; - VLOG(30) << "set status to stopped, thread " << thread_id << " exited"; + VLOG(3) << "set status to stopped, thread " << thread_id << " exited"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 56879ffda5..740cd5219c 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -36,9 +36,63 @@ namespace reader { enum ReaderThreadStatus { Running, Stopped }; +struct DataDesc { + DataDesc(int batch_size, const std::vector& file_names, + const std::string& file_type, const std::string& file_format, + const std::vector& dense_slot_index, + const std::vector& sparse_slot_index, + const std::vector& sparse_slot_ids) + : batch_size_(batch_size), + file_names_(file_names), + file_type_(file_type), + file_format_(file_format), + dense_slot_index_(dense_slot_index), + sparse_slot_index_(sparse_slot_index), + sparse_slot_ids_(sparse_slot_ids) {} + + const int batch_size_; + const std::vector file_names_; + const std::string file_type_; // gzip or plain + const std::string file_format_; // csv or svm + // used for csv data format + const std::vector dense_slot_index_; + const std::vector sparse_slot_index_; + // used for svm data format + const std::vector sparse_slot_ids_; +}; + +inline std::ostream& operator<<(std::ostream& os, const DataDesc& data_desc) { + os << "data_desc:\n"; + os << "\tbatch_size -> " << data_desc.batch_size_ << "\n"; + os << "\tfile_type -> " << data_desc.file_type_ << "\n"; + os << "\tfile_format -> " << data_desc.file_format_ << "\n"; + os << "\tfile_names -> {"; + for (auto& file_name : data_desc.file_names_) { + os << file_name << ","; + } + os << "}\n"; + os << "\tdense_slot_index -> {"; + for (auto& slot : data_desc.dense_slot_index_) { + os << slot << ","; + } + os << "}\n"; + os << "\tsparse_slot_index_ -> {"; + for (auto& slot : data_desc.sparse_slot_index_) { + os << slot << ","; + } + os << "}\n"; + os << "\tsparse_slot_ids_ -> {"; + for (auto& slot : data_desc.sparse_slot_ids_) { + os << slot << ","; + } + os << "}\n"; + + return os; +} + void ReadThread(const std::vector& file_list, - const std::vector& slots, int batch_size, - int thread_id, std::vector* thread_status, + const DataDesc& data_desc, int thread_id, + std::vector* thread_status, std::shared_ptr queue); // monitor all running thread, if they are all stopped, @@ -48,15 +102,15 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: - explicit CTRReader(const std::shared_ptr& queue, - int batch_size, size_t thread_num, - const std::vector& slots, - const std::vector& file_list) - : batch_size_(batch_size), slots_(slots), file_list_(file_list) { + CTRReader(const std::shared_ptr& queue, + int 
thread_num, const DataDesc& data_desc) + : data_desc_(data_desc) { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); - PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); - thread_num_ = std::min(file_list_.size(), thread_num); + PADDLE_ENFORCE_GT(data_desc_.file_names_.size(), 0, + "file list should not be empty"); + + thread_num_ = std::min(data_desc_.file_names_.size(), thread_num); queue_ = queue; SplitFiles(); for (size_t i = 0; i < thread_num_; ++i) { @@ -64,7 +118,7 @@ class CTRReader : public framework::FileReader { } } - ~CTRReader() {} + ~CTRReader() { Shutdown(); } void ReadNext(std::vector* out) override { bool success; @@ -81,7 +135,10 @@ class CTRReader : public framework::FileReader { for (auto& read_thread : read_threads_) { read_thread->join(); } - monitor_thread_->join(); + + if (monitor_thread_) { + monitor_thread_->join(); + } read_threads_.clear(); monitor_thread_.reset(nullptr); @@ -95,9 +152,9 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { + for (int thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread(std::bind( - &ReadThread, file_groups_[thread_id], slots_, batch_size_, + &ReadThread, file_groups_[thread_id], data_desc_, static_cast(thread_id), &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( @@ -108,8 +165,8 @@ class CTRReader : public framework::FileReader { private: void SplitFiles() { file_groups_.resize(thread_num_); - for (size_t i = 0; i < file_list_.size(); ++i) { - auto& file_name = file_list_[i]; + for (size_t i = 0; i < data_desc_.file_names_.size(); ++i) { + auto& file_name = data_desc_.file_names_[i]; std::ifstream f(file_name.c_str()); PADDLE_ENFORCE(f.good(), "file %s not exist!", file_name); file_groups_[i % thread_num_].push_back(file_name); @@ -118,9 +175,7 @@ class CTRReader : public framework::FileReader { private: size_t thread_num_; - const int batch_size_; - const std::vector slots_; - const std::vector file_list_; + const DataDesc data_desc_; std::shared_ptr queue_; std::vector> read_threads_; std::unique_ptr monitor_thread_; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 8dba9baebc..9f3a254c84 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -36,6 +36,7 @@ using paddle::framework::LoD; using paddle::framework::DDim; using paddle::platform::CPUPlace; using paddle::framework::make_ddim; +using paddle::operators::reader::DataDesc; static void generatedata(const std::vector& data, const std::string& file_name) { @@ -126,30 +127,103 @@ TEST(CTR_READER, read_data) { LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; - queue_holder.InitOnce(capacity, {}, false); + queue_holder.InitOnce(capacity, false); std::shared_ptr queue = queue_holder.GetQueue(); int batch_size = 3; int thread_num = 1; - std::vector slots = {"6002", "6003"}; + std::vector sparse_slots = {"6002", "6003"}; std::vector file_list; for (int i = 0; i < thread_num; ++i) { file_list.push_back(gz_file_name); } - CTRReader reader(queue, batch_size, thread_num, slots, file_list); + DataDesc data_desc(batch_size, file_list, "gzip", "svm", {}, {}, + sparse_slots); + + CTRReader 
reader(queue, thread_num, data_desc); reader.Start(); size_t batch_num = std::ceil(static_cast(ctr_data.size()) / batch_size) * thread_num; - check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002, - data_slot_6003, batch_num, batch_size, queue, &reader); + check_all_data(ctr_data, sparse_slots, label_dims, label_value, + data_slot_6002, data_slot_6003, batch_num, batch_size, queue, + &reader); reader.Shutdown(); reader.Start(); - check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002, - data_slot_6003, batch_num, batch_size, queue, &reader); + check_all_data(ctr_data, sparse_slots, label_dims, label_value, + data_slot_6002, data_slot_6003, batch_num, batch_size, queue, + &reader); reader.Shutdown(); } + +static void GenereteCsvData(const std::string& file_name, + const std::vector& data) { + std::ofstream out(file_name.c_str()); + PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name); + for (auto& c : data) { + out << c; + } + out.close(); + PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name); +} + +static void CheckReadCsvOut(const std::vector& out) { + ASSERT_EQ(out.size(), 3); + ASSERT_EQ(out[0].dims()[1], 1); + ASSERT_EQ(out[1].dims()[1], 2); + ASSERT_EQ(out[2].dims()[1], 1); + for (size_t i = 0; i < out[0].numel(); ++i) { + int64_t label = out[0].data()[i]; + auto& dense_dim = out[1].dims(); + for (size_t j = 0; j < dense_dim[1]; ++j) { + ASSERT_EQ(out[1].data()[i * dense_dim[1] + j], + static_cast(label + 0.1)); + } + auto& sparse_lod = out[2].lod(); + for (size_t j = sparse_lod[0][i]; j < sparse_lod[0][i + 1]; ++j) { + ASSERT_EQ(out[2].data()[j], label); + } + } +} + +TEST(CTR_READER, read_csv_data) { + std::string file_name = "test_ctr_reader_data.csv"; + const std::vector csv_data = { + "0 0.1,0.1 0,0,0,0\n", "1 1.1,1.1 1,1,1,1\n", "2 2.1,2.1 2,2,2,2\n", + "3 3.1,3.1 3,3,3,3\n", + }; + GenereteCsvData(file_name, csv_data); + + LoDTensorBlockingQueueHolder queue_holder; + int capacity = 64; + queue_holder.InitOnce(capacity, false); + + std::shared_ptr queue = queue_holder.GetQueue(); + + int batch_size = 3; + int thread_num = 1; + std::vector file_list; + for (int i = 0; i < thread_num; ++i) { + file_list.push_back(file_name); + } + DataDesc data_desc(batch_size, file_list, "plain", "csv", {1}, {2}, {}); + + CTRReader reader(queue, thread_num, data_desc); + + for (size_t i = 0; i < 2; ++i) { + reader.Start(); + std::vector out; + while (true) { + reader.ReadNext(&out); + if (out.empty()) { + break; + } + CheckReadCsvOut(out); + } + reader.Shutdown(); + } +} diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 3f041ff7e4..5b53edff5d 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -32,10 +32,8 @@ class LoDTensorBlockingQueue { friend class LoDTensorBlockingQueueHolder; private: - LoDTensorBlockingQueue(size_t capacity, - const std::vector& dims, - bool speed_test_mode = false) - : queue_(capacity, speed_test_mode), dims_(dims) {} + explicit LoDTensorBlockingQueue(size_t capacity, bool speed_test_mode = false) + : queue_(capacity, speed_test_mode) {} public: bool Push(const std::vector& lod_tensor_vec) { @@ -65,17 +63,15 @@ class LoDTensorBlockingQueue { private: BlockingQueue> queue_; - std::vector dims_; }; class LoDTensorBlockingQueueHolder { public: - void InitOnce(size_t capacity, const std::vector& dims, - bool speed_test_mode = false) { + void 
InitOnce(size_t capacity, bool speed_test_mode = false) { PADDLE_ENFORCE( queue_ == nullptr, "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); - queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode)); + queue_.reset(new LoDTensorBlockingQueue(capacity, speed_test_mode)); } inline const std::shared_ptr& GetQueue() const { diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index a0b70938d3..8fe638ac2f 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -27,13 +27,13 @@ class ReadInferShape : public framework::InferShapeBase { "The ReadOp must take a reader as input."); PADDLE_ENFORCE(ctx->HasOutputs("Out"), "The ReadOp should be assigned with output."); - std::vector reader_dims = ctx->GetReaderDims("Reader"); - std::vector out_names = ctx->Outputs("Out"); - PADDLE_ENFORCE_EQ( - reader_dims.size(), out_names.size(), - "The reader's dim number doesn't match the output number."); - ctx->SetOutputsDim("Out", reader_dims); - if (!ctx->IsRuntime()) { + if (!ctx->IsRuntime() && ctx->Attrs().Get("infer_out")) { + std::vector reader_dims = ctx->GetReaderDims("Reader"); + std::vector out_names = ctx->Outputs("Out"); + PADDLE_ENFORCE_EQ( + reader_dims.size(), out_names.size(), + "The reader's dim number doesn't match the output number."); + ctx->SetOutputsDim("Out", reader_dims); auto in_desc = boost::get(ctx->GetInputVarPtrs("Reader")[0]); auto in_lod_levels = in_desc->GetLoDLevels(); @@ -53,15 +53,18 @@ class ReadInferVarType : public framework::VarTypeInference { public: void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { - std::string reader_name = op_desc.Input("Reader")[0]; - std::vector out_names = op_desc.Output("Out"); - framework::VarDesc* reader = block->FindVarRecursive(reader_name); - auto dtypes = reader->GetDataTypes(); - PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); - for (size_t i = 0; i < dtypes.size(); ++i) { - framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetDataType(dtypes[i]); + bool infer_out = boost::get(op_desc.GetAttr("infer_out")); + if (infer_out) { + std::string reader_name = op_desc.Input("Reader")[0]; + std::vector out_names = op_desc.Output("Out"); + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + auto dtypes = reader->GetDataTypes(); + PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); + for (size_t i = 0; i < dtypes.size(); ++i) { + framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); + out.SetType(framework::proto::VarType::LOD_TENSOR); + out.SetDataType(dtypes[i]); + } } } }; @@ -73,6 +76,7 @@ class ReadOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { + VLOG(3) << "read op in"; framework::ReaderHolder* reader = detail::Ref(scope.FindVar(Input("Reader")), "Cannot find reader variable %s", Input("Reader")) @@ -87,7 +91,9 @@ class ReadOp : public framework::OperatorBase { reader->ReadNext(&ins); if (ins.empty()) { + VLOG(3) << "read empty data in"; if (Attr("throw_eof_exp")) { + VLOG(3) << "throw_eof_exp"; PADDLE_THROW_EOF(); } else { ins.resize(out_arg_names.size()); @@ -96,6 +102,7 @@ class ReadOp : public framework::OperatorBase { tensor.mutable_data(framework::make_ddim({0}), dev_place); } } + VLOG(3) << "read empty data out"; } PADDLE_ENFORCE_EQ(ins.size(), 
out_arg_names.size()); for (size_t i = 0; i < out_arg_names.size(); ++i) { @@ -120,6 +127,7 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker { " only when the data-balance is enabled in ParallelExecutor" " and it is set by ParallelExecutor instance, not users.") .SetDefault(true); + AddAttr("infer_out", "").SetDefault(true); AddComment(R"DOC( Read Operator diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index b82aab1214..3921eedf94 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -65,6 +65,10 @@ void FileReaderMakerBase::Make() { "It means the reader will generate two data each time," "whose shapes are [2,3,4] and [5,6] respectively."); AddAttr>("lod_levels", "The LoD levels of each data."); + AddAttr( + "use_data_config", + "Use the config of all datas like shape_concat/ranks/lod_levels") + .SetDefault(true); Apply(); } @@ -75,19 +79,23 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(ctx->HasOutput("Out"), "The output file reader should not be null."); - const auto shape_concat = ctx->Attrs().Get>("shape_concat"); - const auto ranks = ctx->Attrs().Get>("ranks"); - std::vector shapes = RestoreShapes(shape_concat, ranks); - ctx->SetReaderDims("Out", shapes); - - const auto lod_levels = ctx->Attrs().Get>("lod_levels"); - PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), - "The number of 'lod_levels'(%d) doesn't match the number " - "of 'shapes'(%d).", - lod_levels.size(), shapes.size()); - framework::VarDesc* reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - reader->SetLoDLevels(lod_levels); + bool use_data_config = ctx->Attrs().Get("use_data_config"); + if (use_data_config) { + const auto shape_concat = + ctx->Attrs().Get>("shape_concat"); + const auto ranks = ctx->Attrs().Get>("ranks"); + std::vector shapes = RestoreShapes(shape_concat, ranks); + ctx->SetReaderDims("Out", shapes); + + const auto lod_levels = ctx->Attrs().Get>("lod_levels"); + PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), + "The number of 'lod_levels'(%d) doesn't match the number " + "of 'shapes'(%d).", + lod_levels.size(), shapes.size()); + framework::VarDesc* reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + reader->SetLoDLevels(lod_levels); + } } void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index bcec6f3563..8d695fdedd 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/roi_align_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -255,8 +256,8 @@ class GPUROIAlignOpKernel : public framework::OpKernel { Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; PADDLE_ENFORCE_EQ( @@ -270,14 +271,18 @@ class GPUROIAlignOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } - Tensor roi_batch_id_list_gpu; - framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(), - &roi_batch_id_list_gpu); - GPUROIAlignForward< - T><<>>( + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); + GPUROIAlignForward<<>>( output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, sampling_ratio, - roi_batch_id_list_gpu.data(), + height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data, out->mutable_data(ctx.GetPlace())); } }; @@ -307,8 +312,8 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { } Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; for (int n = 0; n < rois_batch_size; ++n) { @@ -316,24 +321,28 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } - Tensor roi_batch_id_list_gpu; - framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(), - &roi_batch_id_list_gpu); - + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto roi_ptr = allocator.Allocate(roi_batch_id_list.numel() * sizeof(int)); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + int bytes = roi_batch_id_list.numel() * sizeof(int); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); in_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), in_grad, static_cast(0)); + set_zero(dev_ctx, in_grad, static_cast(0)); int output_grad_size = out_grad->numel(); int blocks = NumBlocks(output_grad_size); int threads = kNumCUDAThreads; if (output_grad_size > 0) { - GPUROIAlignBackward< - T><<>>( + GPUROIAlignBackward<<>>( output_grad_size, rois->data(), out_grad->data(), rois_num, spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, roi_batch_id_list_gpu.data(), + sampling_ratio, roi_id_data, in_grad->mutable_data(ctx.GetPlace())); } } diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 75c3dd6bc4..ac3a4201e6 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ 
b/paddle/fluid/operators/roi_pool_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/roi_pool_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -152,8 +153,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel { framework::Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; PADDLE_ENFORCE_EQ( @@ -168,15 +169,20 @@ class GPUROIPoolOpKernel : public framework::OpKernel { } } - framework::Tensor roi_batch_id_list_gpu; - framework::TensorCopy(roi_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &roi_batch_id_list_gpu); - - GPUROIPoolForward< - T><<>>( + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); + + GPUROIPoolForward<<>>( output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, - roi_batch_id_list_gpu.data(), out->mutable_data(ctx.GetPlace()), + height, width, pooled_height, pooled_width, roi_id_data, + out->mutable_data(ctx.GetPlace()), argmax->mutable_data(ctx.GetPlace())); } }; @@ -204,8 +210,8 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { if (x_grad) { framework::Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(platform::CPUPlace()); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; for (int n = 0; n < rois_batch_size; ++n) { @@ -213,25 +219,30 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } - framework::Tensor roi_batch_id_list_gpu; - framework::TensorCopy(roi_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &roi_batch_id_list_gpu); + + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); x_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), x_grad, static_cast(0)); + set_zero(dev_ctx, x_grad, static_cast(0)); int output_grad_size = out_grad->numel(); int blocks = NumBlocks(output_grad_size); int threads = kNumCUDAThreads; if (output_grad_size > 0) { - GPUROIPoolBackward< - T><<>>( + GPUROIPoolBackward<<>>( output_grad_size, rois->data(), out_grad->data(), argmax->data(), rois_num, spatial_scale, channels, height, - width, pooled_height, pooled_width, - 
roi_batch_id_list_gpu.data(), + width, pooled_height, pooled_width, roi_id_data, x_grad->mutable_data(ctx.GetPlace())); } } diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index c07e6962e6..27e0201bd7 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -68,6 +68,11 @@ class SequenceExpandOp : public framework::OperatorWithKernel { "Level number of Input(X)'s lod could be 0. Otherwise " "size of Input(X)'s first level lod should be equal to " "size of Input(Y)'s referred level lod."); + } else { + PADDLE_ENFORCE_EQ(x_dims[0], y_lod[ref_level].size() - 1, + "When Input(X)'s lod is null, the dims[0] of " + "Input(X) should match the " + "size of Input(Y)'s referred level lod."); } int64_t out_first_dim = 0; diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc new file mode 100644 index 0000000000..9349912e09 --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -0,0 +1,113 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/shuffle_channel_op.h" + +namespace paddle { +namespace operators { + +class ShuffleChannelOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ShuffleChannelOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ShuffleChannelOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, + "Input(X) should be a 4-D tensor in NCHW layout."); + + ctx->SetOutputDim("Out", input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), " + "the input feature data of ShuffleChannelOp, the layout is NCHW."); + AddOutput("Out", + "(Tensor, default Tensor), the output of " + "ShuffleChannelOp. The layout is NCHW."); + AddAttr("group", "the number of groups.") + .SetDefault(1) + .AddCustomChecker([](const int& group) { + PADDLE_ENFORCE_GE(group, 1, "group should be larger than 0."); + }); + + AddComment(R"DOC( + Shuffle Channel operator + This operator shuffles the channels of input X. + It divides the input channels in each group into several subgroups, + and obtains a new order by selecting one element from every subgroup in turn. + + Shuffle channel operation makes it possible to build more powerful structures + with multiple group convolutional layers.
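+
+    For example, with group = 2 the four input channels (c0, c1, c2, c3)
+    are viewed as 2 groups of 2 channels each, and the shuffled output
+    channel order is (c0, c2, c1, c3).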
+ Please refer to the following paper for more information: + https://arxiv.org/pdf/1707.01083.pdf + )DOC"); + } +}; + +class ShuffleChannelGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad) should not be null"); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, + "Input(X) should be a 4-D tensor in NCHW layout."); + + ctx->SetOutputDim(framework::GradVarName("X"), input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp, + ops::ShuffleChannelOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp); + +REGISTER_OP_CPU_KERNEL( + shuffle_channel, + ops::ShuffleChannelOpKernel, + ops::ShuffleChannelOpKernel); + +REGISTER_OP_CPU_KERNEL( + shuffle_channel_grad, + ops::ShuffleChannelGradOpKernel, + ops::ShuffleChannelGradOpKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu new file mode 100644 index 0000000000..9506343b3d --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -0,0 +1,125 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/shuffle_channel_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void ShuffleChannel(const int nthreads, const int feature_map_size, + T* output, const T* input, int group_row, + int group_column, int len) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t ii = index; ii < nthreads; ii += offset) { + // use the grid-stride index ii (not the initial index) so that every + // element is shuffled when nthreads exceeds one grid of threads + const int n = ii / group_row / group_column / len; + const int i = (ii / group_column / len) % group_row; + const int j = ii / len % group_column; + const int k = ii - (n * feature_map_size + (i * group_column + j) * len); + T* p_o = output + n * feature_map_size + (j * group_row + i) * len; + p_o[k] = input[ii]; + } +} +template +class ShuffleChannelOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto width = input_dims[3]; + + auto feature_map_size = channel * height * width; + auto sp_sz = height * width; + int group_row = group; + int group_column = channel / group_row; + // count is the product of N, C, H and W, i.e. the same as numel() + int count = num * group_column * group_row * sp_sz; + + int blocks = NumBlocks(output->numel()); + int threads = kNumCUDAThreads; + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + ShuffleChannel< + T><<>>( + count, feature_map_size, output_data, input_data, group_row, + group_column, sp_sz); + } +}; + +template +class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto width = input_dims[3]; + auto feature_map_size = channel * height * width; + auto sp_sz = height * width; + + int group_row = group; + int group_column = channel / group_row; + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + const T* output_grad_data = output_grad->data(); + + int blocks = NumBlocks(output_grad->numel()); + int threads = kNumCUDAThreads; + int count = num * group_column * group_row * sp_sz; + + ShuffleChannel< + T><<>>( + count, feature_map_size, input_grad_data, output_grad_data, group_row, + group_column, sp_sz); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + shuffle_channel, + ops::ShuffleChannelOpCUDAKernel, + ops::ShuffleChannelOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + shuffle_channel_grad, + ops::ShuffleChannelGradOpCUDAKernel, + ops::ShuffleChannelGradOpCUDAKernel);
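The index arithmetic in the kernel above can be spot-checked on the host. A minimal sketch (illustration only; shuffled_channel is a hypothetical helper and not part of this patch): input channel c = i * group_column + j is written to output channel j * group_row + i, so

// Host-side sketch of the channel permutation computed by ShuffleChannel.
static inline int shuffled_channel(int c, int group_row, int group_column) {
  const int i = c / group_column;  // which group the channel belongs to
  const int j = c % group_column;  // position inside that group
  return j * group_row + i;        // transposed position after the shuffle
}

With group_row = 2 and group_column = 2 this sends channels (0, 1, 2, 3) to output positions (0, 2, 1, 3), matching the example in the op documentation above.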
diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h new file mode 100644 index 0000000000..f6af1bc885 --- /dev/null +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class ShuffleChannelOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto width = input_dims[3]; + + auto feature_map_size = channel * height * width; + auto sp_sz = height * width; + int group_row = group; + int group_column = channel / group_row; + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + for (int n = 0; n < num; ++n) { + for (int i = 0; i < group_row; ++i) { + for (int j = 0; j < group_column; ++j) { + const T* p_i = input_data + n * feature_map_size + + (i * group_column + j) * sp_sz; + T* p_o = + output_data + n * feature_map_size + (j * group_row + i) * sp_sz; + // copy sizeof(T) bytes per element; sizeof(int) would truncate + // the planes when T is double + memcpy(p_o, p_i, sizeof(T) * sp_sz); + } + } + } + } +}; + +template +class ShuffleChannelGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + int group = ctx.Attr("group"); + + auto input_dims = input->dims(); + auto num = input_dims[0]; + auto channel = input_dims[1]; + auto height = input_dims[2]; + auto width = input_dims[3]; + auto feature_map_size = channel * height * width; + auto sp_sz = height * width; + + int group_row = group; + int group_column = channel / group_row; + + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + const T* output_grad_data = output_grad->data(); + for (int n = 0; n < num; ++n) { + for (int i = 0; i < group_row; ++i) { + for (int j = 0; j < group_column; ++j) { + const T* p_i = output_grad_data + n * feature_map_size + + (i * group_column + j) * sp_sz; + T* p_o = input_grad_data + n * feature_map_size + + (j * group_row + i) * sp_sz; + memcpy(p_o, p_i, sizeof(T) * sp_sz);  // same sizeof(T) fix as above + } + } + } + } +}; + +} // namespace operators +} // namespace paddle
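The next hunks extend sigmoid_cross_entropy_with_logits with double-precision kernels and a new "normalize" attribute. Both the CPU and GPU kernels evaluate the numerically stable form of binary cross entropy with logits; a minimal host-side sketch of that formula (illustration only, not part of the patch; needs <cmath> and <algorithm>):

// Stable elementwise loss used by the kernels below:
//   max(x, 0) - x * label + log(1 + exp(-|x|))
// equals -label*log(sigmoid(x)) - (1-label)*log(1-sigmoid(x)),
// but never feeds a large positive value into exp().
static inline double bce_with_logits(double x, double label) {
  return std::max(x, 0.0) - x * label + std::log1p(std::exp(-std::abs(x)));
}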
AddOutput("Out", "(Tensor, default Tensor), a 2-D tensor with shape N x D " " of elementwise logistic losses."); + AddAttr("normalize", + "if true, divide the loss by the number of " + "targets != ignore_index.") + .SetDefault(false); AddAttr("ignore_index", "(int, default kIgnoreIndex), Specifies a target value that " "is ignored and" @@ -145,9 +149,14 @@ REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp); -REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CPUDeviceContext, float>); +REGISTER_OP_CPU_KERNEL( + sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel, + ops::SigmoidCrossEntropyWithLogitsKernel); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, float>); + paddle::platform::CPUDeviceContext, float>, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index a1fbc7e5fa..2a4570ef5c 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -11,12 +11,184 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "cub/cub.cuh" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static HOSTDEVICE float real_exp(float x) { return expf(x); } +static HOSTDEVICE float real_exp(double x) { return exp(x); } +static HOSTDEVICE float real_log(float x) { return logf(x); } +static HOSTDEVICE float real_log(double x) { return log(x); } + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, + const int ignore_index, const int limit, + T *out_data, T *counts) { + CUDA_1D_KERNEL_LOOP(i, limit) { + T x = x_data[i]; + T label = label_data[i]; + T eps = static_cast(1e-5); + T diff = label - static_cast(ignore_index); + if ((diff > -eps) && (diff < eps)) { + out_data[i] = static_cast(0.); + counts[i] = 0; + } else { + T term1 = (x > 0) ? 
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index a1fbc7e5fa..2a4570ef5c 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -11,12 +11,184 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "cub/cub.cuh" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// the double overloads return double so no precision is lost +static HOSTDEVICE float real_exp(float x) { return expf(x); } +static HOSTDEVICE double real_exp(double x) { return exp(x); } +static HOSTDEVICE float real_log(float x) { return logf(x); } +static HOSTDEVICE double real_log(double x) { return log(x); } + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, + const int ignore_index, const int limit, + T *out_data, T *counts) { + CUDA_1D_KERNEL_LOOP(i, limit) { + T x = x_data[i]; + T label = label_data[i]; + T eps = static_cast(1e-5); + T diff = label - static_cast(ignore_index); + if ((diff > -eps) && (diff < eps)) { + out_data[i] = static_cast(0.); + counts[i] = 0; + } else { + T term1 = (x > 0) ? x : 0; + T term2 = x * label; + T term3 = real_log(static_cast(1) + real_exp(static_cast(-abs(x)))); + out_data[i] = term1 - term2 + term3; + counts[i] = 1; + } + } +} + +template +__global__ void Sum(const T *counts, int num, const T eps, T *sum) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + T in = 0; + for (int i = threadIdx.x; i < num; i += BlockDim) { + in += counts[i]; + } + __syncthreads(); + auto out = + BlockReduce(temp_storage).Reduce(static_cast(in), cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + T a = out > eps ? out : eps; + sum[0] = a; + } +} + +template +__global__ void Div(T *loss, const int num, const T *norm) { + CUDA_1D_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; } +} + +template +__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data, + const int ignore_index, const T *dout_data, + const int limit, T *dx_data, T *counts) { + CUDA_1D_KERNEL_LOOP(i, limit) { + T x = x_data[i]; + T label = label_data[i]; + T dout = dout_data[i]; + T eps = static_cast(1e-5); + T diff = label - static_cast(ignore_index); + if ((diff > -eps) && (diff < eps)) { + dx_data[i] = static_cast(0.); + counts[i] = 0; + } else { + T sigmoid_x = static_cast(1) / (static_cast(1) + real_exp(-x)); + T diff = sigmoid_x - label; + dx_data[i] = dout * diff; + counts[i] = 1; + } + } +} + +// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) +template +class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + Tensor *Out = context.Output("Out"); + int ignore_index = context.Attr("ignore_index"); + auto out_data = Out->mutable_data(context.GetPlace()); + + auto &dev_ctx = context.cuda_device_context(); + bool normalize = context.Attr("normalize"); + + // Temporary memory + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto cnt_ptr = allocator.Allocate(Labels->numel() * sizeof(T)); + T *counts = reinterpret_cast(cnt_ptr->ptr()); + + int limit = Out->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + GPUSigmoidForward<<>>( + X->data(), Labels->data(), ignore_index, limit, out_data, counts); + if (normalize) { + auto norm_ptr = allocator.Allocate(sizeof(T)); + T *norm = reinterpret_cast(norm_ptr->ptr()); + Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( + counts, limit, static_cast(1e-5), norm); + Div<<>>(out_data, limit, norm); + } + } +}; + +// dX = sigmoid(X) - labels +template +class GPUSigmoidCrossEntropyWithLogitsGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + Tensor *dX = context.Output(framework::GradVarName("X")); + auto dx_data = dX->mutable_data(context.GetPlace()); + + int ignore_index = context.Attr("ignore_index"); + + auto &dev_ctx = context.cuda_device_context(); + // Temporary memory + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto cnt_ptr = allocator.Allocate(X->numel() * sizeof(T)); + T *counts = reinterpret_cast(cnt_ptr->ptr()); + + int limit = dX->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + GPUSigmoidBackward<<>>( + X->data(), Labels->data(),
ignore_index, dOut->data(), limit, + dx_data, counts); + bool normalize = context.Attr("normalize"); + if (normalize) { + auto norm_ptr = allocator.Allocate(sizeof(T)); + T *norm = reinterpret_cast(norm_ptr->ptr()); + Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( + counts, limit, static_cast(1e-5), norm); + Div<<>>(dx_data, limit, norm); + } + } +}; + +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, float>); + ops::GPUSigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, float>, + ops::GPUSigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, double>); REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, float>); + ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, float>, + ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h index 6e75f9e0b8..8f459d573a 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -13,54 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/framework/eigen.h" +#include +#include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/hostdevice.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; -template -using EigenMatrix = framework::EigenMatrix; - -template -struct SigmoidCrossEntropyWithLogitsForward { - HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index) - : ignore_index(ignore_index) {} - - HOSTDEVICE T operator()(const T &x, const T &label) const { - if (static_cast(label) == ignore_index) { - return static_cast(0.); - } - T term1 = (x > 0) ? 
x : 0; - T term2 = x * label; - T term3 = std::log(static_cast(1) + std::exp(-(std::abs(x)))); - return term1 - term2 + term3; - } - - int ignore_index; -}; - -template -struct SigmoidCrossEntropyWithLogitsBackward { - HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index) - : ignore_index(ignore_index) {} - - HOSTDEVICE T operator()(const T &x, const T &label) const { - if (static_cast(label) == ignore_index) { - return static_cast(0.); - } - T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); - return simoid_x - label; - } - - int ignore_index; -}; // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) template @@ -70,16 +30,37 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { const Tensor *X = context.Input("X"); const Tensor *Labels = context.Input("Label"); Tensor *Out = context.Output("Out"); - Out->mutable_data(context.GetPlace()); int ignore_index = context.Attr("ignore_index"); - - auto x = EigenVector::Flatten(*X); - auto labels = EigenVector::Flatten(*Labels); - auto out = EigenVector::Flatten(*Out); - auto &place = *context.device_context().eigen_device(); - - out.device(place) = x.binaryExpr( - labels, SigmoidCrossEntropyWithLogitsForward(ignore_index)); + auto out_data = Out->mutable_data(context.GetPlace()); + int limit = Out->numel(); + auto x_data = X->data(); + auto label_data = Labels->data(); + for (int idx = 0; idx < limit; ++idx) { + T x = x_data[idx]; + T label = label_data[idx]; + if (static_cast(label) == ignore_index) { + out_data[idx] = static_cast(0.); + } else { + T term1 = (x > 0) ? x : 0; + T term2 = x * label; + T term3 = std::log(static_cast(1) + std::exp(-std::abs(x))); + out_data[idx] = term1 - term2 + term3; + } + } + bool normalize = context.Attr("normalize"); + if (normalize) { + // keep norm as T: the eps clamp below must not be truncated to 0 + T norm = 0; + T eps = static_cast(1e-6); + for (int idx = 0; idx < limit; ++idx) { + T diff = label_data[idx] - static_cast(ignore_index); + if ((diff < -eps) || (diff > eps)) { + norm += 1; + } + } + eps = static_cast(1e-5); + norm = norm > eps ? norm : eps; + std::for_each(out_data, out_data + limit, [norm](T &v) { v = v / norm; }); + } } }; @@ -92,19 +73,39 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { const Tensor *Labels = context.Input("Label"); const Tensor *dOut = context.Input(framework::GradVarName("Out")); Tensor *dX = context.Output(framework::GradVarName("X")); - dX->mutable_data(context.GetPlace()); - - auto ignore_index = context.Attr("ignore_index"); - auto x = EigenVector::Flatten(*X); - auto labels = EigenVector::Flatten(*Labels); - auto dout = EigenVector::Flatten(*dOut); - auto dx = EigenVector::Flatten(*dX); - auto &place = - *context.template device_context().eigen_device(); + auto dx_data = dX->mutable_data(context.GetPlace()); - auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward( - static_cast(ignore_index))); - dx.device(place) = dout * diff; + int ignore_index = context.Attr("ignore_index"); + int limit = dX->numel(); + auto x_data = X->data(); + auto label_data = Labels->data(); + auto dout_data = dOut->data(); + for (int idx = 0; idx < limit; ++idx) { + T x = x_data[idx]; + T label = label_data[idx]; + T dout = dout_data[idx]; + if (static_cast(label) == ignore_index) { + dx_data[idx] = static_cast(0.); + } else { + T sigmoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); + T diff = sigmoid_x - label; + dx_data[idx] = dout * diff; + } + } + bool normalize = context.Attr("normalize"); + if (normalize) { + // same as above: norm must be T so the eps clamp works + T norm = 0; + T eps = static_cast(1e-6); + for (int idx = 0; idx < limit; ++idx) { + T diff = label_data[idx] - static_cast(ignore_index); + if ((diff < -eps) || (diff > eps)) { + norm += 1; + } + } + eps = static_cast(1e-5); + norm = norm > eps ? norm : eps; + std::for_each(dx_data, dx_data + limit, [norm](T &v) { v = v / norm; }); + } } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index b993c55fad..031335009b 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -29,8 +29,14 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Xs", "A list of inputs.").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); + AddAttr("calibration_data", "the calibration data for int8"); + AddAttr( + "engine_key", + "The engine_key here is used to distinguish different TRT Engines"); AddAttr("max_batch_size", "the maximum batch size."); AddAttr("workspace_size", "the workspace size."); + AddAttr("sub_block", "the trt block"); + AddAttr("enable_int8", "whether to switch to int8 mode"); AddComment("TensorRT engine operator."); } }; @@ -47,6 +53,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference { namespace ops = paddle::operators; REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, - ops::TensorRTEngineOpMaker); + ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); #endif // PADDLE_WITH_CUDA
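The tensorrt_engine_op.h changes below wire these attributes together: with enable_int8 set and an empty calibration_data string the op runs in calibration mode, executing the original sub_block natively while feeding batches to a TRT int8 calibrator; once a calibration table is supplied, it builds a real int8 engine. Mirroring the unit-test API further down, a caller would set, for example (illustrative values only):

engine_op_desc.SetAttr("enable_int8", static_cast<bool>(true));
engine_op_desc.SetAttr("calibration_data", std::string(""));  // empty => calibration mode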
"paddle/fluid/inference/analysis/helper.h" @@ -62,6 +64,9 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { using inference::Singleton; using inference::tensorrt::TensorRTEngine; +using inference::tensorrt::TRTInt8Calibrator; +using inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; class TensorRTEngineOp : public framework::OperatorBase { private: @@ -70,6 +75,11 @@ class TensorRTEngineOp : public framework::OperatorBase { mutable std::unique_ptr trt_engine_; int max_batch_size_; int workspace_size_; + std::unique_ptr calibrator_; + bool enable_int8_; + std::string calibration_data_; + std::string engine_key_; + bool calibration_mode_; public: TensorRTEngineOp(const std::string &type, @@ -80,26 +90,108 @@ class TensorRTEngineOp : public framework::OperatorBase { input_names_ = Inputs("Xs"); max_batch_size_ = Attr("max_batch_size"); workspace_size_ = Attr("workspace_size"); + enable_int8_ = Attr("enable_int8"); + calibration_data_ = Attr("calibration_data"); + engine_key_ = Attr("engine_key"); auto params = Attr>("parameters"); for (const auto ¶m : params) { param_names_.insert(param); } + // calibration_mode is ture represents we need to + // generate the calibration table data. + calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0); + + VLOG(4) << "calibration_mode: " << calibration_mode_; + if (enable_int8_ && calibration_data_.size()) { + calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); + } } protected: + void RunNativeImpl(const framework::Scope &scope, + const platform::Place &dev_place) const { + framework::Executor executor(dev_place); + auto *block = Attr("sub_block"); + auto *program = block->Program(); + auto ¤t_scope = scope.NewScope(); + auto ctx = executor.Prepare(*program, block->ID()); + executor.RunPreparedContext(ctx.get(), ¤t_scope, false, true, true); + } + void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { + if (calibration_mode_ == true) { + RunCalibration(scope, dev_place); + return; + } RunTrt(scope, dev_place); } + void RunCalibration(const framework::Scope &scope, + const platform::Place &dev_place) const { + // This process will builds a 32-bit trt engine, runs it on the calibration + // set, and records a histogram for each + // tensor of the distribution of activation values. + LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_ + << " is running calibration trt int8... 
"; + int runtime_batch = 1; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast(dev_ctx).stream(); + if (!Singleton::Global().Has(engine_key_)) { + TRTCalibratorEngine *calib_res = + Singleton::Global().Create(engine_key_); + std::unordered_map calib_buffers; + for (auto &x : input_names_) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + calib_buffers[x] = t.memory_size(); + auto t_shape = framework::vectorize(t.dims()); + runtime_batch = t_shape[0]; + } + calib_res->calib_.reset(new TRTInt8Calibrator( + calib_buffers, runtime_batch, engine_key_, dev_place)); + calib_res->thr_.reset(new std::thread([&]() { + calib_res->engine_.reset(new TensorRTEngine( + max_batch_size_, workspace_size_, stream, + boost::get(dev_place).device, enable_int8_, + calib_res->calib_.get())); + VLOG(3) << "start the calib trt engine thread"; + Prepare(scope, dev_place, calib_res->engine_.get()); + })); + } + + TRTInt8Calibrator *temp_calibrator = + Singleton::Global() + .Get(engine_key_) + ->calib_.get(); + std::unordered_map calib_data; + + for (auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + calib_data.emplace(x, t.data()); + } + temp_calibrator->setBatch(calib_data); + RunNativeImpl(scope, dev_place); + } + void RunTrt(const framework::Scope &scope, const platform::Place &dev_place) const { int runtime_batch = 1; + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast(dev_ctx).stream(); if (trt_engine_.get() == nullptr) { - trt_engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, nullptr, - boost::get(dev_place).device)); + trt_engine_.reset( + new TensorRTEngine(max_batch_size_, workspace_size_, stream, + boost::get(dev_place).device, + enable_int8_, calibrator_.get())); Prepare(scope, dev_place, trt_engine_.get()); } @@ -126,6 +218,7 @@ class TensorRTEngineOp : public framework::OperatorBase { } } + cudaStreamSynchronize(stream); PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); // Execute the engine. engine->Execute(runtime_batch); @@ -163,12 +256,13 @@ class TensorRTEngineOp : public framework::OperatorBase { output_index += 1; } - cudaStreamSynchronize(*engine->stream()); + cudaStreamSynchronize(stream); } void Prepare(const framework::Scope &scope, const platform::Place &dev_place, TensorRTEngine *engine) const { - VLOG(4) << "Prepare engine"; + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). 
This process may cost a lot of time."; framework::proto::BlockDesc block_desc; block_desc.ParseFromString(Attr("subgraph")); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 287b0edc96..5a3d9d2c1a 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -96,19 +96,20 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetInput("Xs", std::vector({"x"})); engine_op_desc.SetOutput("Ys", std::vector({"z0"})); - SetAttr(engine_op_desc.Proto(), "subgraph", - block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch_size", 2); - SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); - SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); - SetAttr>(engine_op_desc.Proto(), "parameters", - std::vector({})); - SetAttr>(engine_op_desc.Proto(), - "output_name_mapping", - std::vector({"z0"})); + + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(2)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", std::vector({})); + engine_op_desc.SetAttr("engine_key", std::string("a_engine")); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z0"})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); LOG(INFO) << "create engine op"; - auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); LOG(INFO) << "engine_op " << engine_op.get(); framework::Scope scope; @@ -190,20 +191,19 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetInput("Xs", std::vector({"x0"})); engine_op_desc.SetOutput("Ys", std::vector({"z3"})); - SetAttr(engine_op_desc.Proto(), "subgraph", - block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch_size", batch_size); - SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); - SetAttr>( - engine_op_desc.Proto(), "parameters", - std::vector({"y0", "y1", "y2", "y3"})); - SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); - - SetAttr>(engine_op_desc.Proto(), - "output_name_mapping", - std::vector({"z3"})); - - auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(batch_size)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", + std::vector({"y0", "y1", "y2", "y3"})); + engine_op_desc.SetAttr("engine_key", std::string("b_engine")); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z3"})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); // Execute them. 
engine_op->Run(scope, place); diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 9f504d14a8..2ce8f141d3 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include // NOTE(): support float16 to half in header file. #define PADDLE_CUDA_FP16 @@ -30,6 +31,34 @@ namespace platform { mask = __ballot_sync(FULL_WARP_MASK, (predicate)) #endif +inline static int RoundToPowerOfTwo(int dim) { + if (dim > 512) { + return 1024; + } else if (dim > 256) { + return 512; + } else if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; + } +} + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + template __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, int delta, int width = 32) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 8f80a2d782..2493fb71c0 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -30,8 +30,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { auto it = device_contexts_.find(place); if (it == device_contexts_.end()) { PADDLE_THROW( - "'Place' is not supported, Please re-compile with WITH_GPU " - "option"); + "Place %s is not supported, Please re-compile with WITH_GPU " + "option", + place); } return it->second.get().get(); } diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index ca89d91aad..400a6d7bfa 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" #include +#include +#include #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" @@ -58,7 +60,18 @@ DEFINE_string(selected_gpus, "", namespace paddle { namespace platform { -int GetCUDADeviceCount() { +static int GetCUDADeviceCountImpl() { + const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be empty. 
No GPU detected."; + return 0; + } + } + int count; PADDLE_ENFORCE( cudaGetDeviceCount(&count), @@ -66,6 +79,11 @@ int GetCUDADeviceCount() { return count; } +int GetCUDADeviceCount() { + static auto dev_cnt = GetCUDADeviceCountImpl(); + return dev_cnt; +} + int GetCUDAComputeCapability(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); cudaDeviceProp device_prop; @@ -203,13 +221,17 @@ size_t GpuMaxChunkSize() { void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) { PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), - "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); + "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync " + "(%p -> %p, length: %d)", + src, dst, static_cast(count)); } void GpuMemcpySync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), - "cudaMemcpy failed in paddle::platform::GpuMemcpySync"); + "cudaMemcpy failed in paddle::platform::GpuMemcpySync (%p -> " + "%p, length: %d)", + src, dst, static_cast(count)); } void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index faac6a12c6..269280d604 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -365,7 +365,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { mem_fmt.ndims = axis.size(); for (unsigned int i = 0; i < nchw_tz.size(); ++i) { mem_fmt.dims[i] = nchw_tz[i]; // logical dimensions (nchw format, - // regardless physical layout) + // regardless physical layout) } mem_fmt.data_type = mkldnn_f32; mem_fmt.format = mkldnn_blocked; @@ -374,7 +374,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { for (int i = nchw_tz.size() - 1; i >= 0; --i) { mem_fmt.layout_desc.blocking.padding_dims[i] = nchw_tz[i]; // logical dimensions (nchw format, regardless physical - // layout) + // layout) mem_fmt.layout_desc.blocking.block_dims[i] = 1; mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride; diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 9a91ea38ca..803ea6b260 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,10 +1,11 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool - tracer) + tracer analysis_predictor) + if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) @@ -17,9 +18,9 @@ if(WITH_PYTHON) SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) - if(NOT APPLE AND NOT ANDROID AND NOT WIN32) + if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) - endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) + endif(NOT APPLE AND NOT WIN32) endif(WITH_AMD_GPU) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index dbc7843caa..31c3bfa43f 100644 --- a/paddle/fluid/pybind/imperative.cc +++ 
b/paddle/fluid/pybind/imperative.cc @@ -15,18 +15,38 @@ limitations under the License. */ #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/type_defs.h" namespace paddle { namespace pybind { // Bind Methods -void BindTracer(pybind11::module *m) { +void BindTracer(pybind11::module* m) { pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { + [](imperative::Tracer& self, framework::BlockDesc* root_block) { new (&self) imperative::Tracer(root_block); }) - .def("trace", &imperative::Tracer::Trace) + .def("trace", + [](imperative::Tracer& self, imperative::OpBase* op, + const imperative::VarBasePtrMap& inputs, + const imperative::VarBasePtrMap& outputs, + framework::BlockDesc* block, + const platform::CPUPlace expected_place, + const bool stop_gradient = false) { + self.Trace(op, inputs, outputs, block, expected_place, + stop_gradient); + }) + .def("trace", + [](imperative::Tracer& self, imperative::OpBase* op, + const imperative::VarBasePtrMap& inputs, + const imperative::VarBasePtrMap& outputs, + framework::BlockDesc* block, + const platform::CUDAPlace expected_place, + const bool stop_gradient = false) { + self.Trace(op, inputs, outputs, block, expected_place, + stop_gradient); + }) .def("py_trace", &imperative::Tracer::PyTrace, pybind11::return_value_policy::take_ownership); } diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc new file mode 100644 index 0000000000..e05667d2c7 --- /dev/null +++ b/paddle/fluid/pybind/inference_api.cc @@ -0,0 +1,263 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/pybind/inference_api.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { +using paddle::PaddleDType; +using paddle::PaddleBuf; +using paddle::PaddleTensor; +using paddle::PaddlePlace; +using paddle::PaddlePredictor; +using paddle::NativeConfig; +using paddle::NativePaddlePredictor; +using paddle::AnalysisPredictor; +using paddle::contrib::AnalysisConfig; + +static void BindPaddleDType(py::module *m); +static void BindPaddleBuf(py::module *m); +static void BindPaddleTensor(py::module *m); +static void BindPaddlePlace(py::module *m); +static void BindPaddlePredictor(py::module *m); +static void BindNativeConfig(py::module *m); +static void BindNativePredictor(py::module *m); +static void BindAnalysisConfig(py::module *m); +static void BindAnalysisPredictor(py::module *m); + +void BindInferenceApi(py::module *m) { + BindPaddleDType(m); + BindPaddleBuf(m); + BindPaddleTensor(m); + BindPaddlePlace(m); + BindPaddlePredictor(m); + BindNativeConfig(m); + BindNativePredictor(m); + BindAnalysisConfig(m); + BindAnalysisPredictor(m); + + m->def("create_paddle_predictor", + &paddle::CreatePaddlePredictor); + m->def("create_paddle_predictor", + &paddle::CreatePaddlePredictor); + m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); +} + +void BindPaddleDType(py::module *m) { + py::enum_(*m, "PaddleDType") + .value("FLOAT32", PaddleDType::FLOAT32) + .value("INT64", PaddleDType::INT64); +} + +void BindPaddleBuf(py::module *m) { + py::class_(*m, "PaddleBuf") + .def(py::init()) + .def(py::init([](std::vector &data) { + auto buf = PaddleBuf(data.size() * sizeof(float)); + std::memcpy(buf.data(), static_cast(data.data()), buf.length()); + return std::move(buf); + })) + .def(py::init([](std::vector &data) { + auto buf = PaddleBuf(data.size() * sizeof(int64_t)); + std::memcpy(buf.data(), static_cast(data.data()), buf.length()); + return std::move(buf); + })) + .def("resize", &PaddleBuf::Resize) + .def("reset", + [](PaddleBuf &self, std::vector &data) { + self.Resize(data.size() * sizeof(float)); + std::memcpy(self.data(), data.data(), self.length()); + }) + .def("reset", + [](PaddleBuf &self, std::vector &data) { + self.Resize(data.size() * sizeof(int64_t)); + std::memcpy(self.data(), data.data(), self.length()); + }) + .def("empty", &PaddleBuf::empty) + .def("float_data", + [](PaddleBuf &self) -> std::vector { + auto *data = static_cast(self.data()); + return {data, data + self.length() / sizeof(*data)}; + }) + .def("int64_data", + [](PaddleBuf &self) -> std::vector { + int64_t *data = static_cast(self.data()); + return {data, data + self.length() / sizeof(*data)}; + }) + .def("length", &PaddleBuf::length); +} + +void BindPaddleTensor(py::module *m) { + py::class_(*m, "PaddleTensor") + .def(py::init<>()) + .def_readwrite("name", &PaddleTensor::name) + .def_readwrite("shape", &PaddleTensor::shape) + .def_readwrite("data", &PaddleTensor::data) + .def_readwrite("dtype", &PaddleTensor::dtype) + .def_readwrite("lod", &PaddleTensor::lod); +} + +void BindPaddlePlace(py::module *m) { + py::enum_(*m, "PaddlePlace") + .value("UNK", PaddlePlace::kUNK) + .value("CPU", PaddlePlace::kCPU) + .value("GPU", PaddlePlace::kGPU); +} + +void BindPaddlePredictor(py::module *m) { + auto paddle_predictor = py::class_(*m, "PaddlePredictor"); + paddle_predictor + .def("run", + [](PaddlePredictor &self, const std::vector 
&inputs) { + std::vector outputs; + self.Run(inputs, &outputs); + return outputs; + }) + .def("get_input_tensor", &PaddlePredictor::GetInputTensor) + .def("get_output_tensor", &PaddlePredictor::GetOutputTensor) + .def("zero_copy_run", &PaddlePredictor::ZeroCopyRun) + .def("clone", &PaddlePredictor::Clone); + + auto config = py::class_(paddle_predictor, "Config"); + config.def(py::init<>()) + .def_readwrite("model_dir", &PaddlePredictor::Config::model_dir); +} + +void BindNativeConfig(py::module *m) { + py::class_(*m, "NativeConfig") + .def(py::init<>()) + .def_readwrite("use_gpu", &NativeConfig::use_gpu) + .def_readwrite("device", &NativeConfig::device) + .def_readwrite("fraction_of_gpu_memory", + &NativeConfig::fraction_of_gpu_memory) + .def_readwrite("prog_file", &NativeConfig::prog_file) + .def_readwrite("param_file", &NativeConfig::param_file) + .def_readwrite("specify_input_name", &NativeConfig::specify_input_name) + .def("set_cpu_math_library_num_threads", + &NativeConfig::SetCpuMathLibraryNumThreads) + .def("cpu_math_library_num_threads", + &NativeConfig::cpu_math_library_num_threads); +} + +void BindNativePredictor(py::module *m) { + py::class_(*m, + "NativePaddlePredictor") + .def(py::init()) + .def("init", &NativePaddlePredictor::Init) + .def("run", + [](NativePaddlePredictor &self, + const std::vector &inputs) { + std::vector outputs; + self.Run(inputs, &outputs); + return outputs; + }) + .def("get_input_tensor", &NativePaddlePredictor::GetInputTensor) + .def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor) + .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun) + .def("clone", &NativePaddlePredictor::Clone) + .def("scope", &NativePaddlePredictor::scope, + py::return_value_policy::reference); +} + +void BindAnalysisConfig(py::module *m) { + py::class_ analysis_config(*m, "AnalysisConfig"); + + py::enum_(analysis_config, "Precision") + .value("Float32", AnalysisConfig::Precision::kFloat32) + .value("Int8", AnalysisConfig::Precision::kInt8) + .export_values(); + + analysis_config.def(py::init()) + .def(py::init()) + .def(py::init()) + .def("set_model", (void (AnalysisConfig::*)(const std::string &)) & + AnalysisConfig::SetModel) + .def("set_model", (void (AnalysisConfig::*)(const std::string &, + const std::string &)) & + AnalysisConfig::SetModel) + .def("set_prog_file", &AnalysisConfig::SetProgFile) + .def("set_params_file", &AnalysisConfig::SetParamsFile) + .def("model_dir", &AnalysisConfig::model_dir) + .def("prog_file", &AnalysisConfig::prog_file) + .def("params_file", &AnalysisConfig::params_file) + .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, + py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) + .def("disable_gpu", &AnalysisConfig::DisableGpu) + .def("use_gpu", &AnalysisConfig::use_gpu) + .def("gpu_device_id", &AnalysisConfig::gpu_device_id) + .def("memory_pool_init_size_mb", + &AnalysisConfig::memory_pool_init_size_mb) + .def("fraction_of_gpu_memory_for_pool", + &AnalysisConfig::fraction_of_gpu_memory_for_pool) + .def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim, + py::arg("x") = true) + .def("ir_optim", &AnalysisConfig::ir_optim) + .def("switch_use_feed_fetch_ops", &AnalysisConfig::SwitchUseFeedFetchOps, + py::arg("x") = true) + .def("use_feed_fetch_ops_enabled", + &AnalysisConfig::use_feed_fetch_ops_enabled) + .def("switch_specify_input_names", + &AnalysisConfig::SwitchSpecifyInputNames, py::arg("x") = true) + .def("specify_input_name", &AnalysisConfig::specify_input_name) + .def("enable_tensorrt_engine", 
&AnalysisConfig::EnableTensorRtEngine, + py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, + py::arg("min_subgraph_size") = 3, + py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) + .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) + .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, + py::arg("x") = true) + .def("enable_mkldnn", &AnalysisConfig::EnableMKLDNN) + .def("mkldnn_enabled", &AnalysisConfig::mkldnn_enabled) + .def("set_cpu_math_library_num_threads", + &AnalysisConfig::SetCpuMathLibraryNumThreads) + .def("cpu_math_library_num_threads", + &AnalysisConfig::cpu_math_library_num_threads) + .def("to_native_config", &AnalysisConfig::ToNativeConfig) + .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp) + .def("set_model_buffer", &AnalysisConfig::SetModelBuffer) + .def("model_from_memory", &AnalysisConfig::model_from_memory) + .def("pass_builder", &AnalysisConfig::pass_builder, + py::return_value_policy::reference); +} + +void BindAnalysisPredictor(py::module *m) { + py::class_(*m, "AnalysisPredictor") + .def(py::init()) + .def("init", &AnalysisPredictor::Init) + .def( + "run", + [](AnalysisPredictor &self, const std::vector &inputs) { + std::vector outputs; + self.Run(inputs, &outputs); + return outputs; + }) + .def("get_input_tensor", &AnalysisPredictor::GetInputTensor) + .def("get_output_tensor", &AnalysisPredictor::GetOutputTensor) + .def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun) + .def("clone", &AnalysisPredictor::Clone) + .def("scope", &AnalysisPredictor::scope, + py::return_value_policy::reference); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/inference_api.h b/paddle/fluid/pybind/inference_api.h new file mode 100644 index 0000000000..c2adfbecf7 --- /dev/null +++ b/paddle/fluid/pybind/inference_api.h @@ -0,0 +1,23 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index d32fe58f86..24059140ab 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -15,7 +15,9 @@
 #include "paddle/fluid/pybind/ir.h"
 #include <algorithm>
 #include <string>
+#include <unordered_set>
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
@@ -24,6 +26,7 @@
 namespace py = pybind11;
 using paddle::framework::ir::Graph;
 using paddle::framework::ir::Node;
+using paddle::framework::ir::GraphSafeRemoveNodes;
 using paddle::framework::OpDesc;
 using paddle::framework::ProgramDesc;
 using paddle::framework::VarDesc;
@@ -32,6 +35,7 @@
 using pybind11::return_value_policy;

 namespace paddle {
 namespace pybind {
 void BindGraph(py::module *m) {
+  m->def("graph_safe_remove_nodes", GraphSafeRemoveNodes);
   py::class_<Graph, std::shared_ptr<Graph>>(
       *m, "Graph",
       "The graph is a Directed Acyclic Single Static Assignment Graph, see "
@@ -42,6 +46,8 @@
       .def("get_float", &Graph::Get<float>)
       .def("get_double", &Graph::Get<double>)
       .def("get_string", &Graph::Get<std::string>)
+      .def("get_program", &Graph::Get<ProgramDesc>)
+      .def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>)
       .def("set", [](Graph &self, const std::string &attr_name,
                      int attr) { return self.Set(attr_name, new int(attr)); })
       .def("set",
@@ -57,6 +63,17 @@
          [](Graph &self, const std::string &attr_name, double attr) {
            return self.Set(attr_name, new double(attr));
          })
+      .def("set",
+           [](Graph &self, const std::string &attr_name,
+              const ProgramDesc &attr) {
+             return self.Set(attr_name, new ProgramDesc(attr));
+           })
+      .def("set",
+           [](Graph &self, const std::string &attr_name,
+              const std::unordered_set<const Node *> &attr) {
+             return self.Set(attr_name,
+                             new std::unordered_set<const Node *>(attr));
+           })
       .def("erase", &Graph::Erase)
       .def("nodes", &Graph::Nodes, return_value_policy::reference)
       .def("create_var_node",
@@ -85,12 +102,52 @@
   py::class_<Node> node(*m, "Node");
   node.def("name", &Node::Name)
       .def("node_type", &Node::NodeType)
-      .def("var", &Node::Var)
-      .def("op", &Node::Op)
+      .def("var", &Node::Var, return_value_policy::reference)
+      .def("op", &Node::Op, return_value_policy::reference)
       .def("id", &Node::id)
       .def("is_op", &Node::IsOp)
       .def("is_var", &Node::IsVar)
       .def("is_ctrl_var", &Node::IsCtrlVar)
+      .def("inputs_remove",
+           [](Node &self, int node_id) {
+             for (auto it = self.inputs.begin(); it != self.inputs.end();
+                  it++) {
+               if ((*it)->id() == node_id) {
+                 self.inputs.erase(it);
+               }
+             }
+           })
+      .def("inputs_remove",
+           [](Node &self, Node &node) {
+             for (auto it = self.inputs.begin(); it != self.inputs.end();
+                  it++) {
+               if (*it == &node) {
+                 self.inputs.erase(it);
+               }
+             }
+           })
+      .def("inputs_append",
+           [](Node &self, Node &node) { self.inputs.push_back(&node); })
+      .def("outputs_remove",
+           [](Node &self, int node_id) {
+             for (auto it = self.outputs.begin(); it != self.outputs.end();
+                  it++) {
+               if ((*it)->id() == node_id) {
+                 self.outputs.erase(it);
+               }
+             }
+           })
+      .def("outputs_remove",
+           [](Node &self, Node &node) {
+             for (auto it = self.outputs.begin(); it != self.outputs.end();
+                  it++) {
+               if (*it == &node) {
+                 self.outputs.erase(it);
+               }
+             }
+           })
+      .def("outputs_append",
+           [](Node &self, Node &node) { self.outputs.push_back(&node); })
       .def_readwrite("inputs", &Node::inputs)
       .def_readwrite("outputs", &Node::outputs);
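These graph bindings make IR surgery possible from Python: collect the nodes you want to drop and hand them to `graph_safe_remove_nodes`, which also unlinks them from their neighbors. An illustrative sketch; it assumes `Graph` can be constructed from a `ProgramDesc`, which this hunk's context lines do not show:

```python
from paddle.fluid import core, framework

program = framework.Program()
# ... build or load the program ...
graph = core.Graph(program.desc)   # assumed Graph(ProgramDesc) constructor

# Mark every dropout op for removal, then delete them in one safe call.
marked = {n for n in graph.nodes() if n.is_op() and n.name() == "dropout"}
core.graph_safe_remove_nodes(graph, marked)
```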
&Node::inputs) .def_readwrite("outputs", &Node::outputs); diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 4b218fb3a2..e729be4a95 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -228,7 +228,7 @@ void BindBlockDesc(pybind11::module *m) { void BindVarDsec(pybind11::module *m) { pybind11::class_ var_desc(*m, "VarDesc", ""); - var_desc + var_desc.def(pybind11::init()) .def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference) .def("set_name", &pd::VarDesc::SetName) .def("set_shape", &pd::VarDesc::SetShape) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 86b19e9076..cf7e7b951b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -49,6 +49,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/inference_api.h" #include "paddle/fluid/pybind/ir.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT @@ -137,6 +138,22 @@ PYBIND11_MODULE(core, m) { .def("_grad_ivar", [](const imperative::VarBase &self) { return self.grads_; }, py::return_value_policy::reference) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::CPUPlace &place, + bool blocking) { + std::unique_ptr new_var = + self.NewVarBase(place, blocking); + return new_var.release(); + }, + py::return_value_policy::take_ownership) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::CUDAPlace &place, + bool blocking) { + std::unique_ptr new_var = + self.NewVarBase(place, blocking); + return new_var.release(); + }, + py::return_value_policy::take_ownership) .def("value", [](const imperative::VarBase &self) { return self.var_; }, py::return_value_policy::reference) .def_property( @@ -468,6 +485,7 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::reference); py::class_(m, "Reader", "") + .def("start", &framework::ReaderHolder::Start) .def("reset", &framework::ReaderHolder::ResetAll); using LoDTensorBlockingQueue = @@ -488,19 +506,12 @@ All parameter, weight, gradient are variables in Paddle. .def("is_closed", &LoDTensorBlockingQueue::IsClosed); m.def("init_lod_tensor_blocking_queue", - [](Variable &var, size_t capacity, - const std::vector> &shapes) - -> std::shared_ptr { - std::vector dims(shapes.size()); - std::transform(shapes.begin(), shapes.end(), dims.begin(), - [](const std::vector &shape) { - return make_ddim(shape); - }); - auto *holder = var.GetMutable(); - holder->InitOnce(capacity, dims, - FLAGS_reader_queue_speed_test_mode); - return holder->GetQueue(); - }, + [](Variable &var, + size_t capacity) -> std::shared_ptr { + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); + return holder->GetQueue(); + }, py::return_value_policy::copy); py::class_(m, "_Scope", R"DOC( @@ -625,7 +636,18 @@ All parameter, weight, gradient are variables in Paddle. 
py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace") - .def(py::init()) + .def("__init__", + [](platform::CUDAPlace &self, int dev_id) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE( + dev_id >= 0 && dev_id < platform::GetCUDADeviceCount(), + "Invalid CUDAPlace(%d), must inside [0, %d)", dev_id, + platform::GetCUDADeviceCount()); + new (&self) platform::CUDAPlace(dev_id); +#else + PADDLE_THROW("Cannot use CUDAPlace in CPU only version"); +#endif + }) .def("__str__", string::to_string); py::class_(m, "CPUPlace") @@ -633,7 +655,12 @@ All parameter, weight, gradient are variables in Paddle. .def("__str__", string::to_string); py::class_(m, "CUDAPinnedPlace") - .def(py::init<>()) + .def("__init__", + [](platform::CUDAPinnedPlace &) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version"); +#endif + }) .def("__str__", string::to_string); py::class_(m, "Place") @@ -787,21 +814,33 @@ All parameter, weight, gradient are variables in Paddle. m.def("disable_profiler", platform::DisableProfiler); m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); + m.def("get_pass", [](const py::bytes &binary_str) { + std::string pass_type(binary_str); + auto pass = framework::ir::PassRegistry::Instance().Get(pass_type); + return std::shared_ptr(std::move(pass)); + }); py::class_> pass(m, "Pass"); pass.def(py::init()) + .def("has", &ir::Pass::Has) + .def("set", + [](ir::Pass &self, const std::string &attr_name, + const ProgramDesc &attr) { + return self.Set(attr_name, new ProgramDesc(attr)); + }) .def( - "set_str", + "set", [](ir::Pass &self, const std::string &name, const std::string &attr) { self.Set(name, new std::string(attr)); }) - .def("set_int", [](ir::Pass &self, const std::string &name, - int val) { self.Set(name, new int(val)); }) + .def("set", [](ir::Pass &self, const std::string &name, + int val) { self.Set(name, new int(val)); }) + .def("get_program", &ir::Pass::Get) .def("type", &ir::Pass::Type) .def("apply", [](ir::Pass &self, std::shared_ptr graph) { std::unique_ptr origin_graph(graph.get()); auto optim_graph = self.Apply(std::move(origin_graph)); - graph.reset(optim_graph.release()); + optim_graph.release(); }); py::class_> pb( @@ -992,7 +1031,7 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.remove_unnecessary_lock_ = b; }, - R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default False.)DOC") + R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default True.)DOC") .def_property( "num_trainers", [](const BuildStrategy &self) { return self.num_trainers_; }, @@ -1087,9 +1126,9 @@ All parameter, weight, gradient are variables in Paddle. 
BindRecordIOWriter(&m); BindAsyncExecutor(&m); - BindGraph(&m); BindNode(&m); + BindInferenceApi(&m); } } // namespace pybind } // namespace paddle diff --git a/paddle/py_paddle/.gitignore b/paddle/py_paddle/.gitignore deleted file mode 100644 index 80d1f76fbc..0000000000 --- a/paddle/py_paddle/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -swig_paddle.py -_swig_paddle.so diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py deleted file mode 100644 index 43614b9779..0000000000 --- a/paddle/py_paddle/dataprovider_converter.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.trainer.PyDataProvider2 as dp2 -import collections -import swig_paddle -import numpy -import itertools -from functools import reduce - -__all__ = ['DataProviderConverter'] - - -class IScanner(object): - """ - The scanner will scan Python object two passes, then convert it to Paddle's - argument. - - In the first pass, `pre_scan` will be invoked by every data instance, and - then invoke `finish_pre_scan` to arguments. And the second pass do the same - thing except the functions changed to `scan`, `finish_scan`. - - During the first pass, a scanner may count the shape of input matrix and - allocate memory for this argument. Then fill the data into this argument - in second pass. - """ - - def __init__(self, input_type, pos): - self.input_type = input_type - if not isinstance(self.input_type, dp2.InputType): - raise ValueError("input type should be dataprovider2.InputType") - self.pos = pos - # data_in_gpu is used to indicate whether to create argument on GPU - # or not in GPU mode. Now if using one thread (trainer_count=1), - # trainer uses NeuralNetwork which needs to create argument on GPU - # before calling forward function. So, set data_in_gpu to True. - # Otherwise, trainer uses MultiGradientMachine which will transfer - # data from CPU to GPU in the forward function, set data_in_gpu to - # False in this case. - self.data_in_gpu = swig_paddle.isUsingGpu( - ) and swig_paddle.getTrainerCount() == 1 - - def pre_scan(self, dat): - """ - First pass scan method. During this method, the scanner could count the - data number, and get the total memory size this batch would use. - - :param dat: The python object. - """ - pass - - def finish_pre_scan(self, argument): - """ - Finish first scan pass. Allocate the memory. - - :param argument: Output arguments object. - :type argument: swig_paddle.Arguments - :param dat: Output arguments object. - :type dat: The Python object, numpy.array or List. - :return: - """ - pass - - def scan(self, dat): - """ - Second pass scan method. Copy the data to arguments. - - :param dat: The python object. - """ - pass - - def finish_scan(self, argument): - """ - Finish second pass. Finalize the resources, etc. - - :param argument: Output arguments object. 
- :type argument: swig_paddle.Arguments - """ - pass - - -class DenseScanner(IScanner): - """ - :type __mat__: numpy.ndarray - """ - - def __init__(self, input_type, pos): - IScanner.__init__(self, input_type, pos) - self.__mat__ = None - self.__shape__ = None - self.__height__ = 0 - self.__dim__ = 0 - - def pre_scan(self, dat): - self.__height__ += 1 - if self.__shape__ is None: - self.__shape__ = numpy.array(dat).shape - if len(self.__shape__) > 3: - raise ValueError( - "The dimension of input cannot be greater than 3.") - if len(self.__shape__) == 0: - raise ValueError( - "The input should be a vector, please check your input data." - ) - self.__dim__ = reduce(lambda x, y: x * y, self.__shape__) - if len(self.__shape__) == 1 and self.__dim__ != self.input_type.dim: - raise ValueError( - "The data size must be equal to it in data layer.") - else: - if self.__shape__ != numpy.array(dat).shape: - raise ValueError( - "The data shape must be same in one mini-batch.") - - def finish_pre_scan(self, argument): - self.__mat__ = numpy.ndarray( - shape=(self.__height__, self.__dim__), dtype=numpy.float32) - self.__height__ = 0 - - def scan(self, dat): - # It's better to use NumPy array for speed. - dat = numpy.array(dat) - dat = dat.flatten() - self.__mat__[self.__height__] = dat - self.__height__ += 1 - - def finish_scan(self, argument): - assert isinstance(argument, swig_paddle.Arguments) - if self.__mat__.dtype != numpy.float32: - self.__mat__ = self.__mat__.astype(numpy.float32) - m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, - self.data_in_gpu) - argument.setSlotValue(self.pos, m) - if len(self.__shape__) > 1: - # The last-two dimenstions are the frame height and width. - # For example, the layout is CHW for 3-D feature of image. - # The H and W are the frame height and width. 
- h, w = self.__shape__[-2:] - argument.setSlotFrameHeight(self.pos, h) - argument.setSlotFrameWidth(self.pos, w) - self.__shape__ = None - - -class SparseBinaryScanner(IScanner): - def __init__(self, input_type, pos): - IScanner.__init__(self, input_type, pos) - self.__rows__ = [0] - self.__cols__ = [] - self.__height__ = 0 - self.__value__ = [] - - def scan(self, dat): - self.extend_cols(dat) - self.__rows__.append(len(self.__cols__)) - self.__height__ += 1 - - def extend_cols(self, dat): - self.__cols__.extend(dat) - - def finish_scan(self, argument): - assert isinstance(argument, swig_paddle.Arguments) - m = swig_paddle.Matrix.createSparse( - self.__height__, - self.input_type.dim, - len(self.__cols__), - len(self.__value__) == 0, - False, # trans - False) # TODO supoort GPU - assert isinstance(m, swig_paddle.Matrix) - m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__) - argument.setSlotValue(self.pos, m) - - -class SparseFloatScanner(SparseBinaryScanner): - def __init__(self, input_type, pos): - SparseBinaryScanner.__init__(self, input_type, pos) - - def extend_cols(self, dat): - self.__cols__.extend((x[0] for x in dat)) - self.__value__.extend((x[1] for x in dat)) - - -class IndexScanner(IScanner): - def __init__(self, input_type, pos): - IScanner.__init__(self, input_type, pos) - self.__ids__ = None - self.__idx__ = 0 - - def pre_scan(self, dat): - self.__idx__ += 1 - - def finish_pre_scan(self, argument): - self.__ids__ = [0] * self.__idx__ - self.__idx__ = 0 - - def scan(self, dat): - self.__ids__[self.__idx__] = dat - self.__idx__ += 1 - - def finish_scan(self, argument): - ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu) - assert isinstance(argument, swig_paddle.Arguments) - argument.setSlotIds(self.pos, ids) - - -class SequenceScanner(IScanner): - def __init__(self, input_type, pos, inner_scanner, setter): - IScanner.__init__(self, input_type, pos) - self.__seq__ = [0] - self.__inner_scanner__ = inner_scanner - self.__setter__ = setter - - def pre_scan(self, dat): - for each in dat: - self.__inner_scanner__.pre_scan(each) - - def finish_pre_scan(self, argument): - self.__inner_scanner__.finish_pre_scan(argument) - - def scan(self, dat): - self.__seq__.append(self.__seq__[-1] + self.get_size(dat)) - for each in dat: - self.__inner_scanner__.scan(each) - - def finish_scan(self, argument): - seq = swig_paddle.IVector.create(self.__seq__, False) - self.__setter__(argument, self.pos, seq) - self.__inner_scanner__.finish_scan(argument) - - def get_size(self, dat): - if isinstance(self.__inner_scanner__, SequenceScanner): - return sum(self.__inner_scanner__.get_size(item) for item in dat) - else: - return len(dat) - - -class DataProviderConverter(object): - def __init__(self, input_types): - self.input_types = input_types - assert isinstance(self.input_types, collections.Sequence) - for each in self.input_types: - assert isinstance(each, dp2.InputType) - - def convert(self, dat, argument=None): - if argument is None: - argument = swig_paddle.Arguments.createArguments(0) - assert isinstance(argument, swig_paddle.Arguments) - argument.resize(len(self.input_types)) - - scanners = [ - DataProviderConverter.create_scanner(i, each_type) - for i, each_type in enumerate(self.input_types) - ] - - for each_sample in dat: - for each_step, scanner in itertools.izip(each_sample, scanners): - scanner.pre_scan(each_step) - - for scanner in scanners: - scanner.finish_pre_scan(argument) - - for each_sample in dat: - for each_step, scanner in itertools.izip(each_sample, 
scanners): - scanner.scan(each_step) - - for scanner in scanners: - scanner.finish_scan(argument) - - return argument - - def __call__(self, dat, argument=None): - return self.convert(dat, argument) - - @staticmethod - def create_scanner(i, each): - assert isinstance(each, dp2.InputType) - retv = None - if each.type == dp2.DataType.Dense: - retv = DenseScanner(each, i) - elif each.type == dp2.DataType.Index: - retv = IndexScanner(each, i) - elif each.type == dp2.DataType.SparseNonValue: - retv = SparseBinaryScanner(each, i) - elif each.type == dp2.DataType.SparseValue: - retv = SparseFloatScanner(each, i) - assert retv is not None - - if each.seq_type == dp2.SequenceType.SUB_SEQUENCE: - retv = SequenceScanner( - each, i, retv, - lambda a, p, seq: a.setSlotSubSequenceStartPositions(p, seq)) - - if each.seq_type in [ - dp2.SequenceType.SUB_SEQUENCE, dp2.SequenceType.SEQUENCE - ]: - retv = SequenceScanner( - each, i, retv, - lambda a, p, seq: a.setSlotSequenceStartPositions(p, seq)) - return retv diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py deleted file mode 100644 index 3ae8dbf964..0000000000 --- a/paddle/py_paddle/util.py +++ /dev/null @@ -1,578 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Some Useful method for py_paddle. -""" - -import swig_paddle -import os -import paddle.trainer.PyDataProviderWrapper -import paddle.proto.ParameterConfig_pb2 -import paddle.proto.ModelConfig_pb2 -import paddle.proto.TrainerConfig_pb2 -import weakref -import numpy -import struct -import sys -import copy - - -def initializePaddle(*args): - """ - To initialize paddle process. - :param args: Command line options, such as --use_gpu=0, etc. - :return: Nothing. - """ - old_argv = copy.deepcopy(sys.argv) - old_pypath = os.getenv("PYTHONPATH") - pypath = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) - if old_pypath is not None: - pypath = os.pathsep.join([pypath, old_pypath]) - os.putenv("PYTHONPATH", pypath) - args = [""] + list(args) # argv[0] is command name, it is not important. - swig_paddle.__initPaddle__(args) - sys.argv = old_argv - - -def __monkeypatch_init_paddle__(): - swig_paddle.__initPaddle__ = swig_paddle.initPaddle - swig_paddle.initPaddle = initializePaddle - - -class __ParameterCallbackWrapper__(swig_paddle.UpdateCallback): - """ - Wrap the python callable object to paddle.UpdateCallback. - - INTERNAL USE ONLY. - """ - - def __init__(self, callback): - swig_paddle.UpdateCallback.__init__(self) - self.callback = callback - - def apply(self, param): - self.callback(param) - - @staticmethod - def wrap(callback): - """ - Cast the python callable object/paddle.UpdateCallback to - swig_paddle.UpdateCallback.__disown__ - :param callback: callable or swig_paddle.UpdateCallback object. 
- """ - if isinstance(callback, swig_paddle.UpdateCallback): - return callback.__disown__() - elif isinstance(callback, weakref.ProxyType): - raise RuntimeError("Should not pass __disown__ object") - else: - return __ParameterCallbackWrapper__(callback).__disown__() - - -def __arguments_to_numpy__(i, arg): - assert isinstance(arg, swig_paddle.Arguments) - value = arg.getSlotValue(i) - ids = arg.getSlotIds(i) - prob = arg.getSlotIn(i) - if value is not None: - assert isinstance(value, swig_paddle.Matrix) - value = value.copyToNumpyMat() - if ids is not None: - assert isinstance(ids, swig_paddle.IVector) - ids = ids.copyToNumpyArray() - if prob is not None: - assert isinstance(prob, swig_paddle.Matrix) - prob = prob.copyToNumpyMat() - return {"value": value, "id": ids, "prob": prob} - - -def __monkeypatch_gradient_machine__(): - """ - Add some class methods to GradientMachine. - This method should be only used internally. - """ - swig_paddle.GradientMachine.loadFromConfigFile = \ - staticmethod(loadGradientMachine) - - def __matrix_to_numpy__(m): - if isinstance(m, swig_paddle.Matrix): - return m.copyToNumpyMat() - elif isinstance(m, swig_paddle.IVector): - return m.copyToNumpyArra() - else: - raise RuntimeError("Input arg should be matrix or vecotr.") - - def createFromConfigProto(protoObj, - createMode=swig_paddle.CREATE_MODE_NORMAL, - paramTypes=[ - swig_paddle.PARAMETER_VALUE, - swig_paddle.PARAMETER_GRADIENT, - swig_paddle.PARAMETER_MOMENTUM - ]): - """ - Create Gradient Machine From Proto object. - :param protoObj: Model config - :type protoObj: proto.ModelConfig_pb2.ModelConfig - :param createMode: Create Mode, default is normal. - :type createMode: int - :param paramTypes: the gradient machine parameter type. - :type paramTypes: list of int - :return: paddle.GradientMachine - """ - assert isinstance(protoObj, paddle.proto.ModelConfig) - return swig_paddle.GradientMachine.createByConfigProtoStr( - protoObj.SerializeToString(), createMode, paramTypes) - - swig_paddle.GradientMachine.createFromConfigProto = \ - staticmethod(createFromConfigProto) - - def forwardTest(self, inArgs): - """ - forwardTest. forward gradient machine in test mode, and return a numpy - matrix dict. - - :param inArgs: The input arguments - :type inArgs: paddle.Arguments - :return: A dictionary with keys ['id', 'value'], each value is a - numpy.ndarray. - """ - outArgs = swig_paddle.Arguments.createArguments(0) - self.forward(inArgs, outArgs, swig_paddle.PASS_TEST) - return [ - __arguments_to_numpy__(i, outArgs) - for i in xrange(outArgs.getSlotNum()) - ] - - swig_paddle.GradientMachine.forwardTest = forwardTest - - # Monkey patching backward - swig_paddle.GradientMachine.__backward__ = swig_paddle.GradientMachine.backward - - def backward(self, callback): - """ - GradientMachine Backward - :param callback: a callback which parameter is (paddle.Parameter) or - a paddle.UpdateCallback object. - """ - self.__backward__(__ParameterCallbackWrapper__.wrap(callback)) - - swig_paddle.GradientMachine.backward = backward - - # Monkey patching forwardBackward. - swig_paddle.GradientMachine.__forwardBackward__ = \ - swig_paddle.GradientMachine.forwardBackward - - def forwardBackward(self, - inArgs, - outArgs, - passType, - callback=swig_paddle.UpdateCallback()): - """ - GradientMachine forward backward. - :param inArgs: Input Arguments for GradientMachine. - :type inArgs: paddle.Arguments - :param outArgs: Output Arguments for GradientMachine. - :type outArgs: paddle.Arguments - :param passType: gradient machine's pass type. 
- :type passType: paddle.PassType - :param callback: a callable object with arguments (paddle.Parameter) or - a paddle.UpdateCallback it will be called when - backward - """ - self.__forwardBackward__(inArgs, outArgs, passType, - __ParameterCallbackWrapper__.wrap(callback)) - - swig_paddle.GradientMachine.forwardBackward = forwardBackward - - def getParameters(self): - return (self.getParameter(i) for i in xrange(self.getParameterSize())) - - swig_paddle.GradientMachine.getParameters = getParameters - - def getNonStaticParameters(self): - return (self.getNonStaticParameter(i) - for i in xrange(self.getNonStaticParameterSize())) - - swig_paddle.GradientMachine.getNonStaticParameters = getNonStaticParameters - - def getLayerOutputs(self, layerNames): - """ - getLayerOutputs. get outputs of layers and return a numpy matrix dict. - :param layerNames: layer names. - :type layerNames: string or list. - """ - if isinstance(layerNames, basestring): - layerNames = [layerNames] - elif not isinstance(layerNames, list): - raise RuntimeError("Input args shuld be string or a sting list.") - - output = dict() - for name in layerNames: - output[name] = __arguments_to_numpy__(0, self.getLayerOutput(name)) - return output - - swig_paddle.GradientMachine.getLayerOutputs = getLayerOutputs - - -def loadGradientMachine(config_filename, model_dir=None): - """ - Load a gradient machine from config file name/path. - :param config_filename: The trainer config file name/path - :param model_dir: The model parameter directory. None if same as the - directory of config_filename - :return: GradientMachine with some enhance methods. - :rtype: paddle.GradientMachine - """ - trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile( - config_filename) - assert isinstance(trainer_config, swig_paddle.TrainerConfig) - model_conf = trainer_config.getModelConfig() - network = swig_paddle.GradientMachine.createByModelConfig(model_conf) - assert isinstance(network, swig_paddle.GradientMachine) - if model_dir is None: - model_dir = os.path.dirname(config_filename) - network.loadParameters(model_dir) - return network - - -def loadParameterFile(fn): - """ - Load Paddle Parameter file to numpy.ndarray - :param fn: file name or file like object. - :type fn: str or file like object. - :return: numpy array - :rtype: numpy.ndarray - :raise: paddle.UnsupportError when parameter format is wrong. - """ - if isinstance(fn, str): - with open(fn, 'rb') as f: - return loadParameterFile(f) - elif hasattr(fn, 'read'): # File like object - version, = struct.unpack('i', fn.read(4)) - if version != 0: - raise swig_paddle.UnsupportError() - value_length, = struct.unpack("I", fn.read(4)) - if value_length != 4 and value_length != 8: - raise swig_paddle.UnsupportError() - dtype = 'float32' if value_length == 4 else 'float64' - param_size, = struct.unpack("L", fn.read(8)) - value = numpy.fromfile(fn, dtype) - if len(value) != param_size: - raise swig_paddle.UnsupportError() - return value - else: - raise swig_paddle.UnsupportError() - - -class DataProviderWrapperConverter(object): - """ - A class convert DataFormat from PyDataProvider Wrapper to - py_paddle.paddle.Arguemnts. 
- """ - - class DenseValueConverter(object): - """ - Internal class - """ - - def __init__(self, header_def): - self.__dim__ = header_def.dim - self.buf = [] - - def append(self, other): - assert len(other) == self.__dim__ - self.buf += other - - def __call__(self, slot_idx, arg): - mat = swig_paddle.Matrix.createDense(self.buf, - len(self.buf) / self.__dim__, - self.__dim__) - arg.setSlotValue(slot_idx, mat) - - class IdValueConverter(object): - """ - Internal class - """ - - def __init__(self, *args): - self.buf = [] - - def append(self, other): - assert isinstance(other, int) - self.buf.append(other) - - def __call__(self, slot_idx, arg): - arg.setSlotIds(slot_idx, swig_paddle.IVector.create(self.buf)) - - class SparseNonValueConverter(object): - """ - Internal class - """ - - def __init__(self, slot_def): - self.indices = [0] - self.cols = [] - self.dim = slot_def.dim - - def append(self, other): - self.indices.append(self.indices[-1] + len(other)) - self.cols += other - - def __call__(self, slot_idx, arg): - mat = swig_paddle.Matrix.createSparse( - len(self.indices) - 1, self.dim, len(self.cols), True) - assert isinstance(mat, swig_paddle.Matrix) - mat.sparseCopyFrom(self.indices, self.cols) - self.putIntoArg(slot_idx, arg, mat) - - def putIntoArg(self, slot_idx, arg, mat): - arg.setSlotValue(slot_idx, mat) - - class SparseValueConverter(SparseNonValueConverter): - """ - Internal class - """ - - def __init__(self, slot_def): - super(DataProviderWrapperConverter.SparseValueConverter, - self).__init__(slot_def) - self.values = [] - - def append(self, other): - super(DataProviderWrapperConverter.SparseValueConverter, - self).append(map(lambda x: x[0], other)) - self.values += map(lambda x: x[1], other) - - def __call__(self, slot_idx, arg): - mat = swig_paddle.Matrix.createSparse( - len(self.indices) - 1, self.dim, len(self.cols), False) - assert isinstance(mat, swig_paddle.Matrix) - mat.sparseCopyFrom(self.indices, self.cols, self.values) - self.putIntoArg(slot_idx, arg, mat) - - __SLOT_VALUE_CONVERTER_MAP__ = { - paddle.trainer.PyDataProviderWrapper.DenseSlot: DenseValueConverter, - paddle.trainer.PyDataProviderWrapper.IndexSlot: IdValueConverter, - paddle.trainer.PyDataProviderWrapper.SparseNonValueSlot: - SparseNonValueConverter, - paddle.trainer.PyDataProviderWrapper.SparseValueSlot: - SparseValueConverter - } - - def __init__(self, use_seq, header): - """ - Ctor - :param use_seq: True if use sequence. - :param header: List of slots type, - trainer.PyDataProviderWrapper.SlotType - """ - self.__use_seq__ = use_seq - self.__header__ = header - - def convert(self, wrapper_data, argument=None): - """ - Convert PyDataProviderWrapper format to paddle.Argument - :param wrapper_data: PyDataProviderWrapper yield's data list. - :param argument: The output paddle.Arguments. - If it is not None, it will assign data in this - arguments, else it will create new arguments. - :return: arguments that contains data. 
- :rtype: paddle.Arguments - """ - if argument is None: - argument = swig_paddle.Arguments.createArguments(0) - assert isinstance(argument, swig_paddle.Arguments) - argument.resize(len(self.__header__)) - - values = map( - lambda x: DataProviderWrapperConverter.__SLOT_VALUE_CONVERTER_MAP__[x.__class__](x), - self.__header__) - - if self.__use_seq__: - seq_dim = [[] for _ in xrange(self.__header__.__len__())] - seq_start_pos = [[0] for _ in xrange(self.__header__.__len__())] - - for each_sample in wrapper_data: - for slot_idx, sequence in enumerate(each_sample): - for raw_data in sequence: - values[slot_idx].append(raw_data) - seq_start_pos[slot_idx].append(seq_start_pos[slot_idx][-1] + - len(sequence)) - seq_dim[slot_idx].append(len(sequence)) - - for slot_idx in xrange(len(self.__header__)): - argument.setSlotSequenceDim( - slot_idx, swig_paddle.IVector.create(seq_dim[slot_idx])) - argument.setSlotSequenceStartPositions( - slot_idx, - swig_paddle.IVector.create(seq_start_pos[slot_idx])) - else: - for each_sample in wrapper_data: - for raw_data, value in zip(each_sample, values): - value.append(raw_data) - - for i, v in enumerate(values): - v(i, argument) - - return argument - - def __call__(self, wrapper_data, argument=None): - """ - Invoke self.convert. See documents in self.convert. - """ - return self.convert(wrapper_data, argument) - - -def __monkey_patch_protobuf_objects__(): - def ParameterConfig_toProto(self): - """ - Convert paddle.ParameterConfig to - proto.ParameterConfig_pb2.ParameterConfig - - :return: proto.ParameterConfig_pb2.ParameterConfig object. - """ - param_conf = paddle.proto.ParameterConfig_pb2.ParameterConfig() - param_conf.ParseFromString(self.toProtoString()) - return param_conf - - swig_paddle.ParameterConfig.toProto = ParameterConfig_toProto - - def OptimizationConfig_toProto(self): - """ - Convert paddle.OptimizationConfig to - proto.TrainerConfig_pb2.OptimizationConfig - - :return: proto.TrainerConfig_pb2.OptimizationConfig - """ - opt_conf = proto.TrainerConfig_pb2.OptimizationConfig() - opt_conf.ParseFromString(self.toProtoString()) - return opt_conf - - swig_paddle.OptimizationConfig.toProto = OptimizationConfig_toProto - - def OptimizationConfig_createFromProto(protoObj): - """ - Create a new paddle.OptimizationConfig from - proto.TrainerConfig_pb2.OptimizationConfig - - :param protoObj: proto.TrainerConfig_pb2.OptimizationConfig - :return: paddle.OptimizationConfig - """ - - assert isinstance(protoObj, paddle.proto.OptimizationConfig) - return swig_paddle.OptimizationConfig.createFromProtoString( - protoObj.SerializeToString()) - - swig_paddle.OptimizationConfig.createFromProto = staticmethod( - OptimizationConfig_createFromProto) - - def TrainerConfig_createFromProto(protoObj): - """ - Create a new paddle.TrainerConfig from - proto.OptimizationConfig - - :param protoObj: proto.TrainerConfig - :return: paddle.TrainerConfig - """ - assert isinstance(protoObj, paddle.proto.TrainerConfig) - return swig_paddle.TrainerConfig.createFromProtoString( - protoObj.SerializeToString()) - - swig_paddle.TrainerConfig.createFromProto = staticmethod( - TrainerConfig_createFromProto) - - -def __monkey_patch_parameter__(): - def getBufs(self): - """ - get all parameter vectors. - NOTE: the return value is a generator. Maybe you need to cast to - list or tuple or something else. - - :return: generator of all parameter vectors. 
- :rtype: generator - """ - return (self.getBuf(i) for i in xrange(swig_paddle.NUM_PARAMETER_TYPES)) - - swig_paddle.Parameter.getBufs = getBufs - - -def __monkey_patch_trainer__(): - swig_paddle.Trainer.__create__ = staticmethod(swig_paddle.Trainer.create) - - def Trainer_create(config, model=None): - """ - Create a trainer for model with TrainerCOnfig trainer_config - trainer_config.model_config will be ignored when model is supplied. - Trainer.trainOneBatch() and Trainer.forwardOneBatch() can be used only - when trainer_config.data_config is set. - - A typical usage for Trainer is: - .. code-block:: python - trainer = Trainer.create(trainer_config, model) - for p in xrange(num_passes) - while True: - data = get_next_batch(batch_size) - if not data: - break - trainer.trainOneDataBatch(batch_size, data) - trainer.finishTrainPass() - trainer.finishTrain() - - The trainer will take care of logging, model saving, distributed - training, etc. - - :param config: trainer configuration - :type config: paddle.proto.TrainerConfig - :param model: the model to be trained - :type model: swig_paddle.GradientMachine - :return: a trainer - :rtype swig_paddle.Trainer - - """ - assert isinstance(config, paddle.proto.TrainerConfig) - if model is not None: - assert isinstance(model, swig_paddle.GradientMachine) - return swig_paddle.Trainer.__create__( - swig_paddle.TrainerConfig.createFromProto(config), model) - - swig_paddle.Trainer.create = staticmethod(Trainer_create) - - swig_paddle.Trainer.__getForwardOutput__ = \ - swig_paddle.Trainer.getForwardOutput - - def getForwardOutput(self): - """ - Get the netword outputs from the previous trainOneBatch(), - trainOneDataBatch(), testOneDataPatch(), or forwardOneBatch() call. - - :return: list of dictionary with keys ['id', 'value'], each value is a - numpy.ndarray. - """ - outArgs = self.__getForwardOutput__() - return [ - __arguments_to_numpy__(i, outArgs) - for i in xrange(outArgs.getSlotNum()) - ] - - swig_paddle.Trainer.getForwardOutput = getForwardOutput - - -def monkeypatches(): - patches = [ - __monkeypatch_init_paddle__, __monkeypatch_gradient_machine__, - __monkey_patch_protobuf_objects__, __monkey_patch_parameter__, - __monkey_patch_trainer__ - ] - for patch in patches: - patch() diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md index 9e8b135c1b..6c608fce3c 100644 --- a/paddle/scripts/README.md +++ b/paddle/scripts/README.md @@ -40,7 +40,6 @@ The lastest pre-built build environment images are: | Image | Tag | | ----- | --- | | paddlepaddle/paddle | latest-dev | -| paddlepaddle/paddle | latest-dev-android | ### Start Build @@ -68,8 +67,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" | `WITH_TESTING` | OFF | Build unit tests binaries. | | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. | | `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. | -| `WITH_SWIG_PY` | ON | Build with SWIG python API support. | -| `WITH_C_API` | OFF | Build capi libraries for inference. | | `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. | | `WITH_STYLE_CHECK` | ON | Check the code style when building. 
| | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu | diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index f58e392684..c2156a436e 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -33,7 +33,6 @@ function print_usage() { ${BLUE}gen_doc_lib${NONE}: generate paddle documents library ${BLUE}html${NONE}: convert C++ source code into HTML ${BLUE}dockerfile${NONE}: generate paddle release dockerfile - ${BLUE}capi${NONE}: generate paddle CAPI package ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library ${BLUE}check_style${NONE}: run code style check ${BLUE}cicheck${NONE}: run CI tasks @@ -165,6 +164,9 @@ function cmake_gen() { INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} fi + distibuted_flag=${WITH_DISTRIBUTE:-OFF} + grpc_flag=${WITH_GRPC:-${distibuted_flag}} + cat <= 0: + if ops_type[search_end_index] == "relu": + return Calibrator.u8_max + + input_name = input_index_name[search_end_index][0] + + for i in output_index_name.keys(): + if input_name in output_index_name[i]: + search_end_index = i + break + + if ops_type[ + search_end_index] not in Calibrator.const_sign_op_type and ops_type[ + search_end_index] != 'conv2d': + return Calibrator.s8_max + + if ops_type[search_end_index] != 'conv2d': + continue + + if program.current_block().ops[search_end_index].has_attr( + 'fuse_relu') and program.current_block().ops[ + search_end_index].attr('fuse_relu'): + return Calibrator.u8_max + else: + return Calibrator.s8_max + + return Calibrator.s8_max + + def __check_op_type_with_specified_var_as_input(self, + program, + var_name, + start_index=0): + ''' + Check whether all the type of ops that use the specified variable as the + input.If one of those op is not int8-enabled, return False. + ''' + op_type_list = [ + op.type for op in program.current_block().ops[start_index:] + if var_name in op.input_arg_names + ] + for i in op_type_list: + if not i in Calibrator.supported_int8_op_type: + return False + return True + + def __check_var_source_dt(self, var_name): + ''' + Check whether the specified variable is the output of int8 conv op or not. + If true, return the original op index. + If false, return -1 + ''' + return self._int8_output_var_op_index_dict[ + var_name] if var_name in self._int8_output_var_op_index_dict else -1 + + def __update_int8_output_var_op_index_dict(self, index, var_name=None): + ''' + Update the int8_output_variable/op_index dictionary + ''' + for k, v in self._int8_output_var_op_index_dict.items(): + if v >= index: + self._int8_output_var_op_index_dict[k] = v + 1 + if var_name: + self._int8_output_var_op_index_dict[var_name] = index + + def __update_program(self): + ''' + Update the program with the quantize/dequantize op insertion. 
+ ''' + quantize_index, dequantize_index = self.__get_quantize_dequantize_combination( + self._output_program) + inserted_op_length = 0 + calc_max_func = self.__get_optimal_scaling_factor if self.algo == "KL" else np.max + insert_op_collection = sorted(quantize_index + dequantize_index) + + for index in insert_op_collection: + if index in quantize_index: + quantize_tmp = self._output_program.current_block().create_var( + name="quantize_{}_tmp".format(index), + dtype=core.VarDesc.VarType.UINT8) + original_out_name = self._output_program.current_block().ops[ + index + inserted_op_length - 1].output_names[0] + original_out = self._output_program.current_block().ops[ + index + inserted_op_length - 1].output(original_out_name)[0] + + op = self._output_program.current_block()._insert_op( + index=index + inserted_op_length, + type="quantize", + inputs={"Input": original_out}, + outputs={"Output": quantize_tmp}, ) + + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + op._set_attr( + "Scale", self._var_max_range[original_out] / + calc_max_func(self._var_max_value_map[original_out])) + + if self.__get_max_range_by_var_name( + self._output_program, + original_out) == Calibrator.s8_max: + op._set_attr("is_negative_input", 1) + + self.__update_int8_output_var_op_index_dict( + index + inserted_op_length, "quantize_{}_tmp".format(index)) + + inserted_op_length += 1 + for op in self._output_program.current_block().ops[ + index + inserted_op_length:]: + for j in op.input_names: + if op.input(j) and op.input( + j + )[0] == original_out and op.type in Calibrator.supported_int8_op_type: + op.desc.set_input(j, + ["{}".format(quantize_tmp.name)]) + else: + start_index = index + inserted_op_length + dequantize_tmp_var = self._output_program.current_block( + ).create_var( + name="dequantize_{}_tmp".format(index + 1), + dtype="float32", ) + original_out_var = None + + for original_input in self._output_program.current_block().ops[ + start_index].input_arg_names: + index_res = self.__get_op_index_by_output_var( + self._output_program, original_input) + if index_res != -1: + original_out_var = original_input + break + + if original_out_var: + op = self._output_program.current_block()._insert_op( + index=start_index, + type="dequantize", + inputs={"Input": original_out_var}, + outputs={"Output": dequantize_tmp_var}) + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + op._set_attr("Scale", self._var_max_range[original_out_var] + / calc_max_func(self._var_max_value_map[ + original_out_var])) + + for op_index in range( + start_index + 1, + len(self._output_program.current_block().ops)): + if self._output_program.current_block( + ).ops[op_index].type == "conv2d" and self._output_program.current_block( + ).ops[op_index].attr("force_fp32_output"): + continue + else: + for j in self._output_program.current_block().ops[ + op_index].input_names: + if len(self._output_program.current_block().ops[ + op_index].input(j) + ) and self._output_program.current_block( + ).ops[op_index].input(j)[ + 0] == original_out_var: + self._output_program.current_block( + ).ops[op_index].desc.set_input( + j, + ["{}".format(dequantize_tmp_var.name)]) + + inserted_op_length += 1 + + op._set_attr("data_format", "MKLDNNLAYOUT") + op._set_attr("use_mkldnn", 1) + + def __update_output_program_attr(self): + for i in self._output_program.list_vars(): + if i.name in self._persistable_vars: + i.persistable = False + os.system("rm -rf {}/{}".format(self.pretrained_model, i.name)) + + for i in 
self._u8_output_var: + self._output_program.current_block().var(i).desc.set_dtype( + core.VarDesc.VarType.UINT8) + + for i in self._s8_output_var: + self._output_program.current_block().var(i).desc.set_dtype( + core.VarDesc.VarType.INT8) + + @property + def sampling_program(self): + return self._output_program + + @property + def sampling_vars(self): + return self._weights_var_name + self._conv_input_var_name + self._conv_output_var_name + self._residual_input_var_name + self._pool2d_output_var_name + + def _is_close(self, a, b, rel_tol=1e-09, abs_tol=0.0): + return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + + def __generate_output_program(self): + for i in self.program.list_vars(): + if not i.persistable and i.name in self.sampling_vars: + i.persistable = True + self._persistable_vars.append(i.name) + + self._output_program = self.program.clone() + + def __save_scale(self): + ''' + Update the convolution scale information. + ''' + func = self.__get_optimal_scaling_factor if self.algo == 'KL' else np.max + for i in self._conv_op_index[1:]: + weights_var_name = self.program.current_block().ops[i].input( + 'Filter')[0] + input_var_name = self.program.current_block().ops[i].input('Input')[ + 0] + output_var_name = self.program.current_block().ops[i].output( + 'Output')[0] + self._output_program.current_block().ops[i]._set_attr( + "Scale_weights", self._weights_scaling_factor[weights_var_name]) + + self._output_program.current_block().ops[i]._set_attr( + "Scale_in", self._var_max_range[input_var_name] / + func(self._var_max_value_map[input_var_name])) + self._output_program.current_block().ops[i]._set_attr( + "Scale_out", self._var_max_range[output_var_name] / + func(self._var_max_value_map[output_var_name])) + if self._output_program.current_block().ops[i].desc.input( + "ResidualData"): + residual_var_name = self._output_program.current_block().ops[ + i].desc.input("ResidualData")[0] + self._output_program.current_block().ops[i]._set_attr( + "Scale_in_eltwise", self._var_max_range[residual_var_name] / + func(self._var_max_value_map[residual_var_name])) + + def __sampling(self, sampling_data): + ''' + Sampling the variables data range. 
+ ''' + for i in self.program.list_vars(): + if i.name not in self.sampling_vars: + continue + + if i.name in self._weights_var_name: + scaling_factor_per_channel = [] + data = sampling_data[i.name][0] + for j in range(data.shape[0]): + var_value = float(np.max(np.abs(data[j]))) + if not self._is_close(var_value, 0.0): + scaling_factor_per_channel.append(Calibrator.s8_max / + var_value) + else: + scaling_factor_per_channel.append(0.0) + self._weights_scaling_factor[ + i.name] = scaling_factor_per_channel + else: + if i.name in self._conv_output_var_name: + op_pos = self.__get_op_index_by_output_var(self.program, + i.name) + cur_op = self.program.current_block().ops[op_pos] + + if cur_op.has_attr('fuse_relu') and cur_op.attr( + 'fuse_relu'): + max_range = Calibrator.u8_max + self._u8_output_var.append(i.name) + else: + max_range = Calibrator.s8_max + self._s8_output_var.append(i.name) + else: + max_range = self.__get_max_range_by_var_name(self.program, + i.name) + max_value = [[np.abs(np_data)] + for np_data in sampling_data[i.name]] + + self._var_max_range[i.name] = max_range + self._var_max_value_map[i.name] = max_value + + def __check_force_fp32_attr_by_output_var(self, program, var_name): + for op in program.current_block().ops: + if op.type == "conv2d" and var_name in op.output_arg_names: + return op.attr("force_fp32_output") + return False + + def __get_op_index_by_output_var(self, program, var_name, start_index=0): + ''' + Check whether the specified input variable is the output of the + conv/pool2d op's output or not. + + Returns: + The index if the variable is the output of any conv/pool2d op's + output. + -1 when the variable is not the output of any conv/pool2d op's + output. + ''' + for index, op in enumerate(program.current_block().ops[start_index:]): + if var_name in op.output_arg_names and op.type in Calibrator.supported_int8_op_type: + return index + return -1 + + def __get_op_index_by_input_var(self, program, var_name, start_index=0): + ''' + Get the op index by specified input variable. + Returns: + The op index if the variable is the input of this op or -1 if the + variable is not the input of any op. + ''' + for index, op in enumerate(program.current_block().ops[start_index:]): + if var_name in op.input_arg_names: + return index + + return -1 + + def __get_quantize_dequantize_combination(self, program): + """ + Get the quantize/dequantize op index for further inserting. + Args: + The program desc. + Returns: + Two lists contains the quantize op and dequantize op index information. + """ + quantize_op_index = [] + dequantize_op_index = [] + minimal_conv_count = 2 # there must be two conv ops if not enable the first conv int8. 
+ if len(self._conv_op_index) < minimal_conv_count: + return [], [] + + for index, value in enumerate(self._conv_op_index): + if index == 0: + quantize_op_index.append(self._conv_op_index[index + 1]) + elif index == len(self._conv_op_index) - 1: + output_var = program.current_block().ops[value].output( + "Output")[0] + if self.__check_op_type_with_specified_var_as_input( + program, output_var, index): + dequantize_op_index.append(self._conv_op_index[index] + 2) + else: + program.current_block().ops[value]._set_attr( + "force_fp32_output", True) + + elif self._conv_op_index[index] + 1 < self._conv_op_index[index + + 1]: + + program.current_block().ops[self._conv_op_index[ + index]]._set_attr("force_fp32_output", True) + + for op_index in range(self._conv_op_index[index + 1], + self._conv_op_index[index], -1): + op_type = program.current_block().ops[op_index].type + op_has_int8_input = False + input_var_name = None + input_length = len(program.current_block().ops[op_index] + .input_arg_names) + + for var_name in program.current_block().ops[ + op_index].input_arg_names: + if self.__check_var_source_dt(var_name) != -1: + op_has_int8_input = True + input_var_name = var_name + break + + if op_has_int8_input: + if op_type == "conv2d": + if program.current_block().ops[op_index + + 1].type == "conv2d": + continue + elif program.current_block( + ).ops[op_index + + 1].type in Calibrator.non_conv_int8_op_type: + dequantize_op_index.append(op_index + 2) + break + else: + program.current_block().ops[op_index]._set_attr( + "force_fp32_output", True) + continue + elif not self.__check_force_fp32_attr_by_output_var( + program, input_var_name + ) and op_index not in dequantize_op_index: + share_input_flag = True + for input_attr_name in program.current_block().ops[ + op_index].input_names: + input_var_name = program.current_block().ops[ + op_index].input(input_attr_name)[0] + cousin_op_index = self.__get_op_index_by_input_var( + program, input_var_name) + if cousin_op_index != -1 and cousin_op_index in dequantize_op_index: + share_input_flag = False + break + if share_input_flag: + dequantize_op_index.append(op_index) + + elif input_length: + output_is_to_int8_op = False + share_input_flag = True + for var_name in program.current_block().ops[ + op_index].input_arg_names: + if not self.__check_op_type_with_specified_var_as_input( + program, var_name): + share_input_flag = False + break + + for var_name in program.current_block().ops[ + op_index].output_arg_names: + if self.__get_op_index_by_output_var( + program, var_name, op_index) != -1: + output_is_to_int8_op = True + break + + if share_input_flag or output_is_to_int8_op: + quantize_op_index.append(op_index) + + return quantize_op_index, dequantize_op_index + + def __init_analysis(self): + ''' + Collect the variable names for sampling. + ''' + start_index = 1 #analysis the conv op detail from second conv op. 
+ + for i in self._conv_op_index[start_index:]: + self._weights_var_name.append(self.program.current_block().ops[i] + .input('Filter')[0]) + self._conv_input_var_name.append(self.program.current_block().ops[i] + .input('Input')[0]) + self._conv_output_var_name.append(self.program.current_block().ops[ + i].output('Output')[0]) + self._int8_output_var_op_index_dict[self.program.current_block() + .ops[i].output('Output')[0]] = i + if self.program.current_block().ops[i].desc.input("ResidualData"): + self._residual_input_var_name.append(self.program.current_block( + ).ops[i].desc.input("ResidualData")[0]) + + if self.program.current_block().ops[i + 1].type == "pool2d": + self._pool2d_output_var_name.append(self.program.current_block( + ).ops[i + 1].output('Out')[0]) + + def __expand_quantized_bins(self, quantized_bins, reference_bins): + expanded_quantized_bins = [0] * len(reference_bins) + num_merged_bins = len(reference_bins) / len(quantized_bins) + j_start = 0 + j_end = num_merged_bins + for idx in xrange(len(quantized_bins)): + zero_count = reference_bins[j_start:j_end].count(0) + num_merged_bins = j_end - j_start + if zero_count == num_merged_bins: + avg_bin_ele = 0 + else: + avg_bin_ele = quantized_bins[idx] / ( + num_merged_bins - zero_count + 0.0) + for idx1 in xrange(j_start, j_end): + expanded_quantized_bins[idx1] = (0 if reference_bins[idx1] == 0 + else avg_bin_ele) + j_start += num_merged_bins + j_end += num_merged_bins + if (idx + 1) == len(quantized_bins) - 1: + j_end = len(reference_bins) + return expanded_quantized_bins + + def __safe_entropy(self, reference_distr_P, P_sum, candidate_distr_Q, + Q_sum): + ''' + Calculate the entropy. + ''' + assert len(reference_distr_P) == len(candidate_distr_Q) + tmp_sum1 = 0 + tmp_sum2 = 0 + for idx in range(len(reference_distr_P)): + p_idx = reference_distr_P[idx] + q_idx = candidate_distr_Q[idx] + if p_idx == 0: + tmp_sum1 += 0 + tmp_sum2 += 0 + else: + if q_idx == 0: + print("Fatal error!, idx = " + str(idx) + + " qindex = 0! p_idx = " + str(p_idx)) + tmp_sum1 += p_idx * (math.log(Q_sum * p_idx)) + tmp_sum2 += p_idx * (math.log(P_sum * q_idx)) + return (tmp_sum1 - tmp_sum2) / P_sum + + # Reference: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + def __get_optimal_scaling_factor(self, + activation_blob, + num_quantized_bins=255): + ''' + Using the KL-divergenc method to get the more precise scaling factor. 
+ ''' + max_val = np.max(activation_blob) + min_val = np.min(activation_blob) + if min_val >= 0: + hist, hist_edeges = np.histogram( + activation_blob, bins=2048, range=(min_val, max_val)) + ending_iter = 2047 + starting_iter = int(ending_iter * 0.7) + else: + th = max(abs(max_val), abs(min_val)) + hist, hist_edeges = np.histogram( + activation_blob, bins=2048, range=(-th, th)) + starting_iter = 0 + ending_iter = 2047 + if abs(max_val) > abs(min_val): + while starting_iter < ending_iter: + if hist[starting_iter] == 0: + starting_iter += 1 + continue + else: + break + starting_iter += int((ending_iter - starting_iter) * 0.6) + else: + while ending_iter > 0: + if hist[ending_iter] == 0: + ending_iter -= 1 + continue + else: + break + starting_iter = int(0.6 * ending_iter) + bin_width = hist_edeges[1] - hist_edeges[0] + P_sum = len(activation_blob) + min_kl_divergence = 0 + min_kl_index = 0 + kl_inited = False + for i in range(starting_iter, ending_iter + 1): + reference_distr_P = hist[0:i].tolist() + outliers_count = sum(hist[i:2048]) + if reference_distr_P[i - 1] == 0: + continue + reference_distr_P[i - 1] += outliers_count + reference_distr_bins = reference_distr_P[:] + candidate_distr_Q = hist[0:i].tolist() + num_merged_bins = i / num_quantized_bins + candidate_distr_Q_quantized = [0] * num_quantized_bins + j_start = 0 + j_end = num_merged_bins + for idx in xrange(num_quantized_bins): + candidate_distr_Q_quantized[idx] = sum(candidate_distr_Q[ + j_start:j_end]) + j_start += num_merged_bins + j_end += num_merged_bins + if (idx + 1) == num_quantized_bins - 1: + j_end = i + candidate_distr_Q = self.__expand_quantized_bins( + candidate_distr_Q_quantized, reference_distr_bins) + Q_sum = sum(candidate_distr_Q) + kl_divergence = self.__safe_entropy(reference_distr_P, P_sum, + candidate_distr_Q, Q_sum) + if not kl_inited: + min_kl_divergence = kl_divergence + min_kl_index = i + kl_inited = True + elif kl_divergence < min_kl_divergence: + min_kl_divergence = kl_divergence + min_kl_index = i + else: + pass + if min_kl_index == 0: + while starting_iter > 0: + if hist[starting_iter] == 0: + starting_iter -= 1 + continue + else: + break + min_kl_index = starting_iter + return (min_kl_index + 0.5) * bin_width + + @staticmethod + def __dot(program, output_name="model.dot"): + ''' + Generate the graphiz dot file for debugging. 
+ ''' + dot_graph = "" + dot_nodes = [] + dot_edges = [] + dot_graph += "digraph pm {\n" + for block in program.blocks: + ops = list(block.ops) + for index, op in enumerate(ops): + op_type = op.type + op_name = op_type + "_" + op.output_arg_names[0].replace( + ".", "_") + "___" + str(index) + for name in op.input_arg_names: + name = name.replace(".", "_") + dot_edge = name + " -> " + op_name + if dot_edge not in dot_edges: + dot_edges.append(dot_edge) + dot_node = name + " [shape=oval, style=filled, fillcolor=yellow]" + if dot_node not in dot_nodes: + dot_nodes.append(dot_node) + + for name in op.output_arg_names: + name = name.replace(".", "_") + dot_edge = op_name + " -> " + name + if dot_edge not in dot_edges: + dot_edges.append(dot_edge) + if op_type in Calibrator.supported_int8_op_type: + if op_type == "conv2d" and op.has_attr( + 'force_fp32_output') and op.attr( + "force_fp32_output"): + dot_node = op_name + " [shape=box, style=filled, color=deeppink]" + else: + dot_node = op_name + " [shape=box, style=filled, color=greenyellow]" + elif op_type in ["quantize", "dequantize"]: + dot_node = op_name + " [shape=box, style=filled, color=gold]" + else: + dot_node = op_name + " [shape=box, style=filled, fillcolor=red]" + + if dot_node not in dot_nodes: + dot_nodes.append(dot_node) + + for dot_edge in dot_edges: + dot_graph += dot_edge + "\n" + for dot_node in dot_nodes: + dot_graph += dot_node + "\n" + dot_graph += "}" + + with open(output_name, 'w') as f: + f.write(dot_graph) diff --git a/python/paddle/fluid/contrib/reader/README.md b/python/paddle/fluid/contrib/reader/README.md new file mode 100644 index 0000000000..9e4b7d1ce3 --- /dev/null +++ b/python/paddle/fluid/contrib/reader/README.md @@ -0,0 +1,15 @@ +## CTR READER + +An multi-thread cpp reader that has the same interface with py_reader. It +uses cpp multi-thread to read file and is much more faster then the Python read +thread in py_reader. + +Currently, it support two types of file: + - gzip + - plain text file + +and two types of data format: + - cvs data format is : + * label dense_fea,dense_fea sparse_fea,sparse_fea + - the svm data format is : + * label slot1:fea_sign slot2:fea_sign slot1:fea_sign diff --git a/paddle/py_paddle/__init__.py b/python/paddle/fluid/contrib/reader/__init__.py similarity index 62% rename from paddle/py_paddle/__init__.py rename to python/paddle/fluid/contrib/reader/__init__.py index 5504d1d50c..4cf85ffc16 100644 --- a/paddle/py_paddle/__init__.py +++ b/python/paddle/fluid/contrib/reader/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from util import DataProviderWrapperConverter -from dataprovider_converter import DataProviderConverter +from __future__ import print_function -__all__ = [ - 'paddle', - 'DataProviderConverter', - 'DataProviderWrapperConverter', # for deprecated usage. - 'loadParameterFile' -] -util.monkeypatches() +from . 
+
+__all__ = ctr_reader.__all__
diff --git a/python/paddle/fluid/contrib/reader/ctr_reader.py b/python/paddle/fluid/contrib/reader/ctr_reader.py
index b8449e8d84..44e8647f8c 100644
--- a/python/paddle/fluid/contrib/reader/ctr_reader.py
+++ b/python/paddle/fluid/contrib/reader/ctr_reader.py
@@ -20,6 +20,8 @@ from paddle.fluid.framework import default_main_program, \
     default_startup_program, Variable
 from paddle.fluid.unique_name import generate as unique_name
 
+__all__ = ['ctr_reader']
+
 
 def monkey_patch_reader_methods(reader):
     def __get_reader__():
@@ -30,7 +32,11 @@ def monkey_patch_reader_methods(reader):
     def reset():
         return __get_reader__().reset()
 
+    def start():
+        return __get_reader__().start()
+
     reader.reset = reset
+    reader.start = start
     reader.stop_gradient = True
     reader.persistable = True
     return reader
@@ -44,13 +50,18 @@ def _copy_reader_var_(block, var):
     return new_var
 
 
-def ctr_reader(feed_data,
-               capacity,
-               thread_num,
-               batch_size,
-               file_list,
-               slots,
-               name=None):
+def ctr_reader(
+        feed_dict,
+        file_type,  # gzip or plain
+        file_format,  # csv or svm
+        dense_slot_index,
+        sparse_slot_index,
+        capacity,
+        thread_num,
+        batch_size,
+        file_list,
+        slots,
+        name=None):
     """
     Create a CTR reader for data feeding in Python
 
@@ -67,12 +78,21 @@ def ctr_reader(feed_data,
     Note that :code:`Program.clone()` method cannot clone :code:`py_reader`.
 
     Args:
+        feed_dict(list(variable)): a list of data variables.
+        file_type('gzip'|'plain'): the type of the data file.
+        file_format('csv'|'svm'): csv or svm data format.
+            The csv data format is:
+                label dense_fea,dense_fea sparse_fea,sparse_fea
+            The svm data format is:
+                label slot1:fea_sign slot2:fea_sign slot1:fea_sign
+        dense_slot_index(list(int)): the indices of the dense slots.
+        sparse_slot_index(list(int)): the indices of the sparse slots.
+        capacity(int): The buffer capacity maintained by :code:`py_reader`.
-        thread_num(list|tuple): List of tuples which declaring data shapes.
-        batch_size(list|tuple): List of strs which declaring data type.
-        file_list(list|tuple): List of ints which declaring data lod_level.
-        slots(bool): Whether use double buffer or not.
-        name(basestring): The prefix Python queue name and Reader name. None will
+        thread_num(int): the number of threads used to read files in the C++ reader.
+        batch_size(int): the batch size of the data.
+        file_list(list(str)): the list of file names to be read.
+        slots(list(int64)): the list of slot ids.
+        name(string): The prefix Python queue name and Reader name. None will
             be generated automatically.
 
     Returns:
@@ -80,7 +100,15 @@ def ctr_reader(feed_data,
 
     Examples:
 
-        1. The basic usage of :code:`py_reader` is as follows:
+        1. The basic usage of :code:`ctr_reader` is as follows:
+
+        .. code-block:: python
+
+          py_reader = fluid.contrib.ctr_reader.ctr_reader(
+            feed_dict=datas, file_type='plain', file_format='csv',
+            file_list=file_list, dense_slot_index=[1, 2, 3, 4], sparse_slot_index=[],
+            capacity=64, thread_num=20, batch_size=1000, slots=[], name='ctr_reader')
+
     """
     if name is None:
         queue_name = unique_name('lod_tensor_blocking_queue')
@@ -90,7 +118,7 @@ def ctr_reader(feed_data,
         reader_name = "_".join([name, "reader"])
 
     var = global_scope().var(queue_name)
-    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
+    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity)
 
     startup_blk = default_startup_program().current_block()
     reader_var = startup_blk.create_var(name=reader_name)
@@ -99,12 +127,22 @@ def ctr_reader(feed_data,
         inputs={'blocking_queue': [queue_name]},
         outputs={'Out': [reader_var]},
         attrs={
+            'use_data_config': False,
             'thread_num': thread_num,
             'batch_size': batch_size,
             'file_list': file_list,
-            'slots': slots,
+            'file_type': file_type,
+            'file_format': file_format,
+            'dense_slot_index': dense_slot_index,
+            'sparse_slot_index': sparse_slot_index,
+            'sparse_slots': slots,
+            'ranks': [],
+            'lod_levels': [],
+            'shape_concat': []
         })
 
+    dtypes = [data.dtype for data in feed_dict]
+    reader_var.desc.set_dtypes(dtypes)
     reader_var.persistable = True
 
     main_prog_reader_var = _copy_reader_var_(
@@ -118,6 +156,9 @@ def ctr_reader(feed_data,
     main_blk = default_main_program().current_block()
     main_blk.append_op(
-        type='read', inputs={'Reader': [reader]}, outputs={'Out': feed_data})
+        type='read',
+        inputs={'Reader': [reader]},
+        attrs={'infer_out': False},
+        outputs={'Out': feed_dict})
 
     return reader
diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py
index 7d6b070203..f38d978341 100644
--- a/python/paddle/fluid/contrib/slim/graph/graph.py
+++ b/python/paddle/fluid/contrib/slim/graph/graph.py
@@ -11,8 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from __future__ import print_function
+import os
+import subprocess
 from ....framework import Program
+from ....framework import Block
+from .... import core
 
 __all__ = ['Graph', 'ImitationGraph', 'IRGraph']
diff --git a/python/paddle/fluid/contrib/slim/quantization/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py
new file mode 100644
index 0000000000..6c26475f48
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/quantization/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from . import quantization_pass
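Stepping back to the ctr_reader changes above: monkey_patch_reader_methods attaches start/reset closures onto the reader variable. The pattern itself is plain Python; a toy, self-contained illustration (hypothetical names, no Paddle dependency):

```python
class FakeCppReader(object):
    """Toy stand-in for the underlying C++ reader (illustrative only)."""
    def start(self):
        return 'started'
    def reset(self):
        return 'reset'


def monkey_patch(obj, backend):
    # Mirrors monkey_patch_reader_methods: forward calls to the backend.
    obj.start = lambda: backend.start()
    obj.reset = lambda: backend.reset()
    return obj


class ReaderVariable(object):
    pass


reader = monkey_patch(ReaderVariable(), FakeCppReader())
print(reader.start(), reader.reset())  # started reset
```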
+from .quantization_pass import *
+
+__all__ = quantization_pass.__all__
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
new file mode 100644
index 0000000000..266a106bc5
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -0,0 +1,318 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+from .... import core
+from ....framework import IrGraph
+from ....framework import Program
+from ....framework import Variable
+from ....initializer import Constant
+from .... import unique_name
+
+__all__ = ['QuantizationTransformPass']
+
+
+class QuantizationTransformPass(object):
+    def __init__(self,
+                 scope=None,
+                 program_exe=None,
+                 weight_bits=8,
+                 activation_bits=8,
+                 activation_quantize_type='abs_max',
+                 weight_quantize_type='abs_max',
+                 window_size=10000):
+        """
+        Convert and rewrite the IrGraph according to the weight and
+        activation quantization type.
+        Args:
+            weight_bits (int): quantization bit number for weights; the bias
+                is not quantized.
+            activation_bits (int): quantization bit number for activations.
+            activation_quantize_type (str): quantization type for activations;
+                'abs_max' and 'range_abs_max' are supported. With 'abs_max',
+                the quantization scale is calculated dynamically at each step
+                in both training and testing. With 'range_abs_max', a static
+                quantization scale is calculated during training and used in
+                inference.
+            weight_quantize_type (str): quantization type for weights; only
+                'abs_max' is supported. 'range_abs_max' is usually not used
+                for weights, since weights are fixed once the model is well
+                trained.
+            window_size (int): the window size for 'range_abs_max' quantization.
+        Examples:
+        .. code-block:: python
+            # The original graph will be rewritten.
+            import paddle.fluid as fluid
+            from paddle.fluid.contrib.slim.quantization \
+                import QuantizationTransformPass
+            from paddle.fluid.contrib.slim.graph import IrGraph
+            from paddle.fluid import core
+
+            graph = IrGraph(core.Graph(program.desc), for_test=False)
+            exe = fluid.Executor(fluid.CPUPlace())
+            transform_pass = QuantizationTransformPass(fluid.global_scope(),
+                                                       exe)
+            transform_pass.apply(graph)
+        """
+        self._scope = scope
+        self._program_exe = program_exe
+        self._weight_bits = weight_bits
+        self._activation_bits = activation_bits
+
+        quant_type = ['abs_max', 'range_abs_max']
+        if activation_quantize_type not in quant_type:
+            raise ValueError(
+                "Unknown activation_quantize_type: '%s'. It can only be "
+                "'abs_max' or 'range_abs_max'." % str(activation_quantize_type))
+        if weight_quantize_type not in quant_type:
+            raise ValueError(
+                "Unknown weight_quantize_type: '%s'. It can only be "
It can only be ", + "'abs_max' or 'range_abs_max'.", str(weight_quantize_type)) + + self._activation_quantize_type = activation_quantize_type + self._weight_quantize_type = weight_quantize_type + self._window_size = window_size + + self._need_initialized = collections.OrderedDict() + self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] + self._quantizable_grad_ops = [ + '%s_grad' % (op) for op in self._quantizable_ops + ] + self._fake_quant_op_types = [ + 'fake_quantize_abs_max', 'fake_quantize_range_abs_max' + ] + self._fake_dequant_op_types = ['fake_dequantize_max_abs'] + self._is_test = None + self._global_step = None + + def apply(self, graph): + assert isinstance(graph, + IrGraph), 'graph must be the instance of IrGraph.' + self._need_initialized.clear() + self._is_test = graph.is_test() + # marked the variable which has been dequantized. + dequantized_vars = collections.OrderedDict() + params = [p.name() for p in graph.all_parameters()] + + def _transform_forward(graph, op): + for var_node in op.inputs: + if var_node.name() in dequantized_vars: + dequant_var_node = dequantized_vars[var_node.name()] + else: + quant_bits = self._weight_bits if var_node.name() in params \ + else self._activation_bits + quant_type = self._weight_quantize_type if var_node.name() \ + in params else self._activation_quantize_type + quant_var_node, scale_var_node = self._insert_quant_op( + graph, var_node, quant_bits, quant_type) + dequant_var_node = self._insert_dequant_op( + graph, quant_var_node, scale_var_node, quant_bits) + dequantized_vars[var_node.name()] = dequant_var_node + graph.update_input_link(var_node, dequant_var_node, op) + + def _transform_backward(graph, op): + no_dequanted_input_vars = True + for var_node in op.inputs: + if var_node.name() in dequantized_vars: + dequant_var_node = dequantized_vars[var_node.name()] + graph.update_input_link(var_node, dequant_var_node, op) + no_dequanted_input_vars = False + if no_dequanted_input_vars: + raise ValueError("There is no dequanted inputs for op %s." % + (op.name())) + + if not self._is_test: + self._create_global_step(graph) + ops = graph.all_ops() + # The process of _transform_forward and _transform_backward is needed in two for loops. + # The loop for transforming the forward graph: + for op in ops: + if op.name() in self._quantizable_ops: + _transform_forward(graph, op) + # The loop for renaming the inputs of backward op. + for op in ops: + if op.name() in self._quantizable_grad_ops: + _transform_backward(graph, op) + + if len(self._need_initialized) > 0: + assert self._scope is not None, \ + 'The scope cannot be set None when activation_quantize_type equals to range_abs_max.' + assert self._program_exe is not None, \ + 'The program_exe cannot be set None when activation_quantize_type equals to range_abs_max.' 
+ init_program = Program() + for var_desc, initializer in self._need_initialized.iteritems(): + var = Variable(init_program.global_block()) + var._set_desc(var_desc) + initializer(var, init_program.global_block()) + self._program_exe.run(program=init_program, scope=self._scope) + + return graph + + def _create_global_step(self, graph): + if self._weight_quantize_type == 'range_abs_max' or \ + self._activation_quantize_type == 'range_abs_max': + counter_name = '@STEP_COUNTER@' + for node in graph.all_vars(): + if node.name() == counter_name: + self._global_step = node + if self._global_step is None: + global_step_in = graph.create_param_node( + name=counter_name, + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + var_dtype=core.VarDesc.VarType.INT64) + self._need_initialized[global_step_in.var()] = \ + Constant(value=0, force_cpu=True) + global_step_out = graph.create_var_node_from_desc( + global_step_in.var()) + increment_op = graph.create_op_node( + op_type='increment', + attrs={'step': 1.0}, + inputs={'X': global_step_in}, + outputs={'Out': global_step_out}) + graph.link_to(global_step_in, increment_op) + graph.link_to(increment_op, global_step_out) + self._global_step = global_step_out + + def _insert_quant_op(self, graph, var_node, quant_bits, quant_type): + """ + Insert fake_quantize_op in the graph. + """ + if quant_type == 'abs_max': + return self._insert_quant_abs_max_op(graph, var_node, quant_bits) + elif quant_type == 'range_abs_max': + return self._insert_quant_range_abs_max_op(graph, var_node, + quant_bits) + + def _insert_quant_abs_max_op(self, graph, var_node, quant_bits): + """ + Insert fake_quantize_abs_max op in the graph. + """ + assert var_node.is_var(), '{} is not a var'.format(var_node.name()) + + quant_var_node = graph.create_var_node( + name=self._quantized_var_name(var_node.name()), + var_type=var_node.var().type(), + shape=var_node.var().shape(), + var_dtype=var_node.var().dtype()) + scale_var_node = graph.create_var_node( + name=self._quantized_scale_name(var_node.name()), + var_type=var_node.var().type(), + shape=var_node.var().shape(), + var_dtype=var_node.var().dtype()) + quant_op_node = graph.create_op_node( + op_type='fake_quantize_abs_max', + attrs={'bit_length': quant_bits}, + inputs={'X': var_node}, + outputs={'Out': quant_var_node, + 'OutScale': scale_var_node}) + graph.link_to(var_node, quant_op_node) + graph.link_to(quant_op_node, quant_var_node) + graph.link_to(quant_op_node, scale_var_node) + return quant_var_node, scale_var_node + + def _insert_quant_range_abs_max_op(self, graph, var_node, quant_bits): + """ + Insert fake_quantize_range_abs_max on the graph. + """ + assert var_node.is_var(), '{} is not a var'.format(var_node.name()) + + quant_var_node = graph.create_var_node( + name=self._quantized_var_name(var_node.name()), + var_type=var_node.var().type(), + shape=var_node.var().shape(), + var_dtype=var_node.var().dtype()) + + scale_in_node = graph.create_param_node( + name=self._quantized_scale_name(var_node.name()), + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + var_dtype=var_node.var().dtype()) + self._need_initialized[scale_in_node.var()] = Constant(value=0.001) + + scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) + inputs = {'X': var_node, 'InScale': scale_in_node} + outputs = {'Out': quant_var_node, 'OutScale': scale_out_node} + + if not self._is_test: + # The name of scales_var_node maybe 'scales_0', 'scales_1', etc. 
+ scales_node = graph.create_param_node( + name=unique_name.generate('scales'), + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[self._window_size], + var_dtype=var_node.var().dtype()) + self._need_initialized[scales_node.var()] = Constant(value=0) + inputs['Iter'] = self._global_step + outputs['OutScales'] = scales_node + attrs = { + 'window_size': self._window_size, + 'bit_length': quant_bits, + 'is_test': self._is_test + } + quant_op_node = graph.create_op_node( + op_type='fake_quantize_range_abs_max', + attrs=attrs, + inputs=inputs, + outputs=outputs) + + graph.link_to(var_node, quant_op_node) + graph.link_to(scale_in_node, quant_op_node) + graph.link_to(quant_op_node, quant_var_node) + graph.link_to(quant_op_node, scale_out_node) + + if not self._is_test: + graph.link_to(self._global_step, quant_op_node) + graph.link_to(quant_op_node, scales_node) + + return quant_var_node, scale_out_node + + def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits): + """ + Insert fake_dequantize_op in the graph. + """ + assert var_node.is_var(), '{} is not a var'.format(var_node.name()) + + dequant_var_node = graph.create_var_node( + name=self._dequantized_var_name(var_node.name()), + var_type=var_node.var().type(), + shape=var_node.var().shape(), + var_dtype=var_node.var().dtype()) + max_range = (1 << (quant_bits - 1)) - 1 + dequant_op_node = graph.create_op_node( + op_type='fake_dequantize_max_abs', + attrs={'max_range': float(max_range)}, + inputs={'X': var_node, + 'Scale': scale_var_node}, + outputs={'Out': dequant_var_node}) + graph.link_to(var_node, dequant_op_node) + graph.link_to(scale_var_node, dequant_op_node) + graph.link_to(dequant_op_node, dequant_var_node) + return dequant_var_node + + def _quantized_var_name(self, var_name): + """ + Return quantized variable name for the input `var_name`. + """ + return "%s.quantized" % (var_name) + + def _dequantized_var_name(self, var_name): + """ + Return dequantized variable name for the input `var_name`. + """ + return "%s.dequantized" % (var_name) + + def _quantized_scale_name(self, var_name): + """ + Return the scale name of quantized variable for the input `var_name`. + """ + return "%s.scale" % (var_name) diff --git a/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py new file mode 100644 index 0000000000..1bd4b95d6b --- /dev/null +++ b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py @@ -0,0 +1,175 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
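The helpers at the end of the pass above imply a simple naming chain that the unit test below asserts on. A tiny illustrative check (the variable name is hypothetical):

```python
# Illustrative only: the name chain produced by the transform pass.
def quantized_name(name):
    return "%s.quantized" % name

def dequantized_name(name):
    return "%s.dequantized" % name

assert dequantized_name(quantized_name("conv2d_0.w_0")) \
    .endswith(".quantized.dequantized")
```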
+ +import unittest +import random +import numpy as np +import paddle.fluid as fluid +import six +from paddle.fluid.framework import Program +from paddle.fluid.framework import IrGraph +from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass +from paddle.fluid import core + + +def linear_fc(num): + data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + hidden = fluid.layers.fc(hidden, size=128, act='relu') + loss = fluid.layers.cross_entropy(input=hidden, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def residual_block(num): + def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + act='relu', + bias_attr=False): + tmp = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=bias_attr) + return fluid.layers.batch_norm(input=tmp, act=act) + + data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True) + short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None) + hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu') + fc = fluid.layers.fc(input=hidden, size=10) + loss = fluid.layers.cross_entropy(input=fc, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestQuantizationTransformPass(unittest.TestCase): + def setUp(self): + self.quantizable_op_and_inputs = { + 'conv2d': ['Input', 'Filter'], + 'depthwise_conv2d': ['Input', 'Filter'], + 'mul': ['X', 'Y'] + } + self.quantizable_grad_op_inputs = { + 'conv2d_grad': ['Input', 'Filter'], + 'depthwise_conv2d_grad': ['Input', 'Filter'], + 'mul_grad': ['X', 'Y'] + } + + def check_program(self, transform_pass, program): + quantized_ops = set() + for block in program.blocks: + for op in block.ops: + # check forward + if op.type in self.quantizable_op_and_inputs: + for arg_name in op.input_arg_names: + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + quantized_ops.add(arg_name) + + for op in block.ops: + # check backward + if op.type in self.quantizable_grad_op_inputs: + for pname in self.quantizable_grad_op_inputs[op.type]: + arg_name = op.input(pname)[0] + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + self.assertTrue(arg_name in quantized_ops) + + def linear_fc_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = linear_fc(3) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + exe = fluid.Executor(fluid.CPUPlace()) + graph = IrGraph(core.Graph(main.desc), for_test=False) + transform_pass = QuantizationTransformPass( + scope=fluid.global_scope(), + program_exe=exe, + activation_quantize_type=quant_type) + transform_pass.apply(graph) + marked_nodes = set() + for op in graph.all_ops(): + if op.name().find('quantize') > -1: + marked_nodes.add(op) + graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes) + program = graph.to_program() + self.check_program(transform_pass, program) + val_graph = IrGraph(core.Graph(program.desc), for_test=False) + val_marked_nodes = set() + for op in val_graph.all_ops(): + if op.name().find('quantize') > -1: + val_marked_nodes.add(op) + val_graph.draw('.', 'val_fc_' + quant_type, 
val_marked_nodes) + + def test_linear_fc_quant_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.linear_fc_quant('abs_max') + + def test_linear_fc_quant_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.linear_fc_quant('range_abs_max') + + def residual_block_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = residual_block(2) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + exe = fluid.Executor(fluid.CPUPlace()) + graph = IrGraph(core.Graph(main.desc), for_test=False) + transform_pass = QuantizationTransformPass( + scope=fluid.global_scope(), + program_exe=exe, + activation_quantize_type=quant_type) + transform_pass.apply(graph) + marked_nodes = set() + for op in graph.all_ops(): + if op.name().find('quantize') > -1: + marked_nodes.add(op) + graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes) + program = graph.to_program() + self.check_program(transform_pass, program) + val_graph = IrGraph(core.Graph(program.desc), for_test=False) + val_marked_nodes = set() + for op in val_graph.all_ops(): + if op.name().find('quantize') > -1: + val_marked_nodes.add(op) + val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes) + + def test_residual_block_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.residual_block_quant('abs_max') + + def test_residual_block_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.residual_block_quant('range_abs_max') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt index 79bec8c4ad..81aee1233d 100644 --- a/python/paddle/fluid/contrib/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt @@ -1,6 +1,10 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +if(APPLE OR WIN32 OR NOT WITH_MKL) + list(REMOVE_ITEM TEST_OPS test_calibration) +endif() + foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py new file mode 100644 index 0000000000..f07fefe7e0 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_calibration.py @@ -0,0 +1,257 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
+import unittest +import os +import numpy as np +import time +import sys +import random +import paddle +import paddle.fluid as fluid +import argparse +import functools +import contextlib +import paddle.fluid.profiler as profiler +from paddle.dataset.common import download +from PIL import Image, ImageEnhance +import math +sys.path.append('..') +import int8_inference.utility as int8_utility + +random.seed(0) +np.random.seed(0) + +DATA_DIM = 224 + +THREAD = 1 +BUF_SIZE = 102400 + +DATA_DIR = 'data/ILSVRC2012' + +img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) +img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + + +# TODO(guomingz): Remove duplicated code from line 45 ~ line 114 +def resize_short(img, target_size): + percent = float(target_size) / min(img.size[0], img.size[1]) + resized_width = int(round(img.size[0] * percent)) + resized_height = int(round(img.size[1] * percent)) + img = img.resize((resized_width, resized_height), Image.LANCZOS) + return img + + +def crop_image(img, target_size, center): + width, height = img.size + size = target_size + if center == True: + w_start = (width - size) / 2 + h_start = (height - size) / 2 + else: + w_start = np.random.randint(0, width - size + 1) + h_start = np.random.randint(0, height - size + 1) + w_end = w_start + size + h_end = h_start + size + img = img.crop((w_start, h_start, w_end, h_end)) + return img + + +def process_image(sample, mode, color_jitter, rotate): + img_path = sample[0] + + img = Image.open(img_path) + + img = resize_short(img, target_size=256) + img = crop_image(img, target_size=DATA_DIM, center=True) + + if img.mode != 'RGB': + img = img.convert('RGB') + + img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255 + img -= img_mean + img /= img_std + + return img, sample[1] + + +def _reader_creator(file_list, + mode, + shuffle=False, + color_jitter=False, + rotate=False, + data_dir=DATA_DIR): + def reader(): + with open(file_list) as flist: + full_lines = [line.strip() for line in flist] + if shuffle: + np.random.shuffle(full_lines) + + lines = full_lines + + for line in lines: + img_path, label = line.split() + img_path = os.path.join(data_dir, img_path) + if not os.path.exists(img_path): + continue + yield img_path, int(label) + + mapper = functools.partial( + process_image, mode=mode, color_jitter=color_jitter, rotate=rotate) + + return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE) + + +def val(data_dir=DATA_DIR): + file_list = os.path.join(data_dir, 'val_list.txt') + return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir) + + +class TestCalibrationForResnet50(unittest.TestCase): + def setUp(self): + self.int8_download = 'int8/download' + self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + + self.int8_download) + + data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz' + data_md5 = '1b6c1c434172cca1bf9ba1e4d7a3157d' + self.data_cache_folder = self.download_data(data_url, data_md5, "data") + + # reader/decorator.py requires the relative path to the data folder + cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data", + self.data_cache_folder) + os.system(cmd) + + self.iterations = 50 + + def cache_unzipping(self, target_folder, zip_path): + if not os.path.exists(target_folder): + cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder, + zip_path) + os.system(cmd) + + def download_data(self, data_url, data_md5, folder_name): + download(data_url, self.int8_download, data_md5) + data_cache_folder = 
os.path.join(self.cache_folder, folder_name) + file_name = data_url.split('/')[-1] + zip_path = os.path.join(self.cache_folder, file_name) + self.cache_unzipping(data_cache_folder, zip_path) + return data_cache_folder + + def download_resnet50_model(self): + # resnet50 fp32 data + data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz' + data_md5 = '4a5194524823d9b76da6e738e1367881' + self.model_cache_folder = self.download_data(data_url, data_md5, + "resnet50_fp32") + + def run_program(self, model_path, generate_int8=False, algo='direct'): + image_shape = [3, 224, 224] + os.environ['FLAGS_use_mkldnn'] = 'True' + + fluid.memory_optimize(fluid.default_main_program()) + + exe = fluid.Executor(fluid.CPUPlace()) + + [infer_program, feed_dict, + fetch_targets] = fluid.io.load_inference_model(model_path, exe) + + t = fluid.transpiler.InferenceTranspiler() + t.transpile(infer_program, fluid.CPUPlace()) + + val_reader = paddle.batch(val(), batch_size=1) + + if generate_int8: + int8_model = os.path.join(os.getcwd(), "calibration_out") + + if os.path.exists(int8_model): + os.system("rm -rf " + int8_model) + os.system("mkdir " + int8_model) + + print("Start calibration ...") + + calibrator = int8_utility.Calibrator( + program=infer_program, + pretrained_model=model_path, + algo=algo, + exe=exe, + output=int8_model, + feed_var_names=feed_dict, + fetch_list=fetch_targets) + + test_info = [] + cnt = 0 + for batch_id, data in enumerate(val_reader()): + image = np.array( + [x[0].reshape(image_shape) for x in data]).astype("float32") + label = np.array([x[1] for x in data]).astype("int64") + label = label.reshape([-1, 1]) + running_program = calibrator.sampling_program.clone( + ) if generate_int8 else infer_program.clone() + for op in running_program.current_block().ops: + if op.has_attr("use_mkldnn"): + op._set_attr("use_mkldnn", True) + + _, acc1, _ = exe.run( + running_program, + feed={feed_dict[0]: image, + feed_dict[1]: label}, + fetch_list=fetch_targets) + if generate_int8: + calibrator.sample_data() + + test_info.append(np.mean(acc1) * len(data)) + cnt += len(data) + + if batch_id != self.iterations - 1: + continue + + break + + if generate_int8: + calibrator.save_int8_model() + + print( + "Calibration is done and the corresponding files are generated at {}". 
+ format(os.path.abspath("calibration_out"))) + else: + return np.sum(test_info) / cnt + + def test_calibration(self): + self.download_resnet50_model() + fp32_acc1 = self.run_program(self.model_cache_folder + "/model") + self.run_program(self.model_cache_folder + "/model", True) + int8_acc1 = self.run_program("calibration_out") + delta_value = np.abs(fp32_acc1 - int8_acc1) + self.assertLess(delta_value, 0.01) + + +class TestCalibrationForMobilenetv1(TestCalibrationForResnet50): + def download_mobilenetv1_model(self): + # mobilenetv1 fp32 data + data_url = 'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' + data_md5 = '13892b0716d26443a8cdea15b3c6438b' + self.model_cache_folder = self.download_data(data_url, data_md5, + "mobilenetv1_fp32") + + def test_calibration(self): + self.download_mobilenetv1_model() + fp32_acc1 = self.run_program(self.model_cache_folder + "/model") + self.run_program(self.model_cache_folder + "/model", True, algo='KL') + int8_acc1 = self.run_program("calibration_out") + delta_value = np.abs(fp32_acc1 - int8_acc1) + self.assertLess(delta_value, 0.01) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 7b70d19de5..a24e1d1300 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -88,8 +88,8 @@ class DataToLoDTensorConverter(object): raise ValueError( "Reshape error. What is defined in data layer is {}, but receive {}" .format(self.shape, arr.shape)) - else: - self._check_shape(arr.shape) + #else: + # self._check_shape(arr.shape) t = core.LoDTensor() t.set(arr, self.place) if self.lod_level > 0: diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 0d06d0f2c9..d3ff14a179 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -27,6 +27,8 @@ from .. import compat as cpt __all__ = ['Executor', 'global_scope', 'scope_guard'] g_scope = core.Scope() +InferNativeConfig = core.NativeConfig +InferAnalysisConfig = core.AnalysisConfig def global_scope(): @@ -303,7 +305,9 @@ class Executor(object): def __init__(self, place): self.place = place self.program_caches = dict() - self.executor = None + p = core.Place() + p.set_place(self.place) + self._default_executor = core.Executor(p) self._closed = False def _get_program_cache(self, program_cache_key): @@ -395,12 +399,13 @@ class Executor(object): >>> ... 
>>> exe.close() """ - if not self._closed and self.executor: - self.executor.close() + if not self._closed: + self._default_executor.close() self._closed = True def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, return_numpy): + exe = program._executor if isinstance(feed, dict): feed_tensor_dict = dict() for feed_name in feed: @@ -412,8 +417,7 @@ class Executor(object): feed_tensor.set(feed[feed_name], core.CPUPlace()) feed_tensor_dict[feed_name] = feed_tensor - self.executor.feed_and_split_tensor_into_local_scopes( - feed_tensor_dict) + exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict) elif isinstance(feed, list) or isinstance(feed, tuple): if len(feed) != len(program._places): raise ValueError( @@ -434,10 +438,10 @@ class Executor(object): tensor = tmp res_dict[feed_name] = tensor res.append(res_dict) - self.executor.feed_tensors_into_local_scopes(res) + exe.feed_tensors_into_local_scopes(res) fetch_var_names = list(map(_to_name_str, fetch_list)) - self.executor.run(fetch_var_names, fetch_var_name) + exe.run(fetch_var_names, fetch_var_name) arr = scope.find_var(fetch_var_name).get_lod_tensor_array() if return_numpy: @@ -509,12 +513,9 @@ class Executor(object): compiled = isinstance(program, compiler.CompiledProgram) # For backward compatibility, run directly. if not compiled: - if not self.executor: - p = core.Place() - p.set_place(self.place) - self.executor = core.Executor(p) return self._run( program, + self._default_executor, feed=feed, fetch_list=fetch_list, feed_var_name=feed_var_name, @@ -524,7 +525,6 @@ class Executor(object): use_program_cache=use_program_cache) program._compile(scope, self.place) - self.executor = program._executor if program._is_data_parallel: return self._run_parallel( program, @@ -533,11 +533,14 @@ class Executor(object): fetch_list=fetch_list, fetch_var_name=fetch_var_name, return_numpy=return_numpy) + elif program._is_inference: + return self._run_inference(program._executor, feed) else: # TODO(panyx0718): Can compile program to optimize executor # performance. return self._run( program._program, + self._default_executor, feed=feed, fetch_list=fetch_list, feed_var_name=feed_var_name, @@ -546,8 +549,8 @@ class Executor(object): return_numpy=return_numpy, use_program_cache=use_program_cache) - def _run(self, program, feed, fetch_list, feed_var_name, fetch_var_name, - scope, return_numpy, use_program_cache): + def _run(self, program, exe, feed, fetch_list, feed_var_name, + fetch_var_name, scope, return_numpy, use_program_cache): if feed is None: feed = {} @@ -585,8 +588,11 @@ class Executor(object): fetch_var_name=fetch_var_name) self._feed_data(program, feed, feed_var_name, scope) - self.executor.run(program.desc, scope, 0, True, True) + exe.run(program.desc, scope, 0, True, True) outs = self._fetch_data(fetch_list, fetch_var_name, scope) if return_numpy: outs = as_numpy(outs) return outs + + def _run_inference(self, exe, feed): + return exe.run(feed) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 569ca2a4f7..2bdae60db3 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -23,6 +23,7 @@ import traceback import six import numpy as np +import subprocess from .. import compat as cpt from .proto import framework_pb2 @@ -37,11 +38,13 @@ try: from . 
import core except ImportError as e: if os.name == 'nt': + executable_path = os.path.abspath(os.path.dirname(sys.executable)) raise ImportError( - """NOTE: You may need to run \"set PATH=c:\python27\lib:%PATH%\" - if you encounters \"mkldnn.dll not found\" errors. If you have python - installed in other directory, replace \"c:\python27\lib" with your own - directory. The original error is: \n""" + cpt.get_exception_message(e)) + """NOTE: You may need to run \"set PATH=%s;%%PATH%%\" + if you encounters \"DLL load failed\" errors. If you have python + installed in other directory, replace \"%s\" with your own + directory. The original error is: \n %s""" % + (executable_path, executable_path, cpt.get_exception_message(e))) else: raise ImportError( """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" @@ -67,6 +70,7 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() _imperative_tracer_ = None +_imperative_current_expected_place_ = None def _in_imperative_mode(): @@ -77,6 +81,10 @@ def _imperative_tracer(): return _imperative_tracer_ +def _current_expected_place(): + return _imperative_current_expected_place_ + + class NameScope(object): def __init__(self, name="", parent=None): self._children = dict() @@ -380,8 +388,8 @@ class Variable(object): self._ivar.stop_gradient = stop_gradient def _numpy(self): - tensor = self._ivar.value().get_tensor() - return np.array(tensor) + new_ivar = self._ivar._copy_to(core.CPUPlace(), True) + return np.array(new_ivar.value().get_tensor()) def _backward(self): self._ivar._run_backward() @@ -1308,6 +1316,7 @@ class Block(object): def _trace_op(self, op, stop_gradient=False): if _in_imperative_mode(): _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc, + _imperative_current_expected_place_, stop_gradient) def _insert_op(self, index, *args, **kwargs): @@ -1510,6 +1519,154 @@ class Block(object): return ret_var +class IrGraph(object): + """ + IrGraph uses core.Graph as the delegation to accomplish the manipulation. + """ + + def __init__(self, graph, for_test=False): + """ + Construct the IrGraph using core.Graph. + Args: + graph(core.Graph): C++ Graph. + for_test(bool): True for the test graph and false for the train graph. + """ + assert isinstance( + graph, core.Graph), 'graph must be the instance of core.Graph.' 
+        self.graph = graph
+        self._for_test = for_test
+
+    def is_test(self):
+        return self._for_test
+
+    def all_parameters(self):
+        param_nodes = set()
+        for node in self.graph.nodes():
+            if node.is_var() and node.var() is not None and node.var(
+            ).persistable():
+                param_nodes.add(node)
+        return param_nodes
+
+    def all_vars(self):
+        return {node for node in self.graph.nodes() if node.is_var()}
+
+    def all_ops(self):
+        return {node for node in self.graph.nodes() if node.is_op()}
+
+    def create_param_node(self, name, var_type, shape, var_dtype):
+        var_desc = core.VarDesc(name)
+        var_desc.set_type(var_type)
+        var_desc.set_shape(shape)
+        var_desc.set_dtype(var_dtype)
+        var_desc.set_persistable(True)
+        return self.graph.create_var_node(var_desc)
+
+    def create_var_node(self, name, var_type, shape, var_dtype):
+        var_desc = core.VarDesc(name)
+        var_desc.set_type(var_type)
+        var_desc.set_shape(shape)
+        var_desc.set_dtype(var_dtype)
+        return self.graph.create_var_node(var_desc)
+
+    def create_var_node_from_desc(self, var_desc):
+        return self.graph.create_var_node(var_desc)
+
+    def create_op_node(self, op_type, attrs, inputs, outputs):
+        op_desc = core.OpDesc()
+        op_desc.set_type(op_type)
+        for attr, value in attrs.iteritems():
+            self._update_desc_attr(op_desc, attr, value)
+        for input_name, var_nodes in inputs.iteritems():
+            if not isinstance(var_nodes, list):
+                var_nodes = [var_nodes]
+            op_desc.set_input(input_name,
+                              [var_node.name() for var_node in var_nodes])
+        for output_name, var_nodes in outputs.iteritems():
+            if not isinstance(var_nodes, list):
+                var_nodes = [var_nodes]
+            op_desc.set_output(output_name,
+                               [var_node.name() for var_node in var_nodes])
+        return self.graph.create_op_node(op_desc)
+
+    def create_op_node_from_desc(self, op_desc):
+        return self.graph.create_op_node(op_desc)
+
+    def update_input_link(self, old_input_node, new_input_node, op_node):
+        assert old_input_node in self.graph.nodes() and new_input_node in self.graph.nodes() and \
+            op_node in self.graph.nodes(), 'The three arguments must be in the graph nodes.'
+        old_input_node.outputs_remove(op_node)
+        op_node.inputs_remove(old_input_node)
+        new_input_node.outputs_append(op_node)
+        op_node.inputs_append(new_input_node)
+        op_node.op()._rename_input(old_input_node.name(), new_input_node.name())
+
+    def link_to(self, node_in, node_out):
+        assert node_in in self.graph.nodes() and node_out in self.graph.nodes(), \
+            'The two arguments must be in the graph nodes.'
+ node_in.outputs_append(node_out) + node_out.inputs_append(node_in) + + def safe_remove_nodes(self, remove_nodes): + if not isinstance(remove_nodes, set): + remove_nodes = set(remove_nodes) + core.graph_safe_remove_nodes(self.graph, remove_nodes) + + def draw(self, save_path, name, marked_nodes=None): + def _convert_to_pdf(dot_file_path): + pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf' + exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \ + + ' -o ' + pdf_save_path, shell=True) + if exited_code != 0: + print('The dot command is needed for creating pdf files.') + print('The {} is saved as the dot filetype.'.format( + dot_file_path)) + + remove_ctr_vars = set() + ops_num = 0 + for node in self.graph.nodes(): + if node.is_ctrl_var(): + remove_ctr_vars.add(node) + elif node.is_op(): + ops_num += 1 + print('Total ops num = {}.'.format(ops_num)) + self.safe_remove_nodes(remove_ctr_vars) + if marked_nodes is not None: + if not isinstance(marked_nodes, set): + marked_nodes = set(marked_nodes) + marked_nodes = marked_nodes - remove_ctr_vars + if self.graph.has('__graphviz__marked_node__'): + self.graph.erase('__graphviz__marked_node__') + self.graph.set('__graphviz__marked_node__', marked_nodes) + viz_dot_path = os.path.join(save_path, name) + '.dot' + viz_pass = core.get_pass('graph_viz_pass') + viz_pass.set('graph_viz_path', viz_dot_path) + viz_pass.apply(self.graph) + _convert_to_pdf(viz_dot_path) + + def to_program(self): + convert_pass = core.get_pass('graph_to_program_pass') + convert_pass.set('program', Program().desc) + convert_pass.apply(self.graph) + desc = convert_pass.get_program('program') + program = Program._construct_from_desc(desc) + return program + + def _update_desc_attr(self, desc, name, val): + """ + Update the value of desc's attribute by attribute's name. + """ + if isinstance(val, Block): + desc.set_block_attr(name, val.desc) + elif isinstance(val, list) and val and all( + isinstance(v, Block) for v in val): + desc.set_blocks_attr(name, [v.desc for v in val]) + elif isinstance(val, core.BlockDesc) or \ + isinstance(val, core.ProgramDesc): + desc.set_serialized_attr(name, val.serialize_to_string()) + else: + desc._set_attr(name, val) + + class Program(object): """ Python Program. Beneath it is a ProgramDesc, which is used for @@ -1545,12 +1702,20 @@ class Program(object): self._current_role = core.op_proto_and_checker_maker.OpRole.Forward self._op_role_var = [] - # for distribute + # for distribute training + # _is_distributed = True if under distributed training self._is_distributed = False + # _is_chief = True if the trainer is the first one, usually No.0 self._is_chief = False - self._slice_vars_and_attrs = [] + # _parameters_on_pservers records all the parameters distributed on parameter servers. + self._parameters_on_pservers = None + # _endpoints is a list about parameter servers ip:port, such as ["ip:port","ip:port"] self._endpoints = [] + # if current role is parameter server, the _ps_endpoint is its "ip:port" + self._ps_endpoint = None + # trainers_endpoints, it is used for distribution. self._trainers_endpoints = [] + # the distributed lookup table names self._distributed_lookup_table = None @property @@ -1934,6 +2099,23 @@ class Program(object): p._sync_with_cpp() return p + @staticmethod + def _construct_from_desc(desc): + """ + Construct a program from program desc. + + Args: + desc(core.ProgramDesc): The program desc for constructing. + + Returns: + Program: A program. 
+ """ + p = Program() + p.desc = desc + p.blocks = [Block(p, i) for i in six.moves.range(p.desc.num_blocks())] + p._sync_with_cpp() + return p + @property def random_seed(self): """ @@ -2064,8 +2246,9 @@ class Program(object): "Program") self._is_distributed = other._is_distributed self._is_chief = other._is_chief - self._slice_vars_and_attrs = other._slice_vars_and_attrs + self._parameters_on_pservers = other._parameters_on_pservers self._endpoints = other._endpoints + self._ps_endpoint = other._ps_endpoint self._distributed_lookup_table = other._distributed_lookup_table def _copy_data_info_from(self, other): @@ -2325,5 +2508,18 @@ def _imperative_guard(tracer): global _imperative_tracer_ tmp_trace = _imperative_tracer_ _imperative_tracer_ = tracer + yield + _imperative_tracer_ = tmp_trace + + +@contextlib.contextmanager +def _imperative_place_guard(place): + global _imperative_current_expected_place_ + tmp_place = _imperative_current_expected_place_ + _imperative_current_expected_place_ = place + + yield + + _imperative_current_expected_place_ = tmp_place diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index 5d3ebb25a9..ff3984b11f 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -25,18 +25,28 @@ def enabled(): @contextlib.contextmanager -def guard(): +def guard(place=None): train = framework.Program() startup = framework.Program() tracer = core.Tracer(train.current_block().desc) + + if place is None: + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._imperative_guard(tracer): - yield + with framework._imperative_place_guard(place): + yield def to_variable(value, block=None): if isinstance(value, np.ndarray): + assert enabled(), "to_variable could only be called in imperative mode" + if not block: block = framework.default_main_program().current_block() py_var = framework.Variable( @@ -47,9 +57,7 @@ def to_variable(value, block=None): dtype=value.dtype) var = py_var._ivar.value() tensor = var.get_tensor() - tensor.set(value, core.CPUPlace()) + tensor.set(value, framework._current_expected_place()) return py_var elif isinstance(value, framework.Variable): return value - else: - raise ValueError("Unsupported type %s" % type(value)) diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 03fbfe76d1..140c0ff037 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -27,6 +27,7 @@ __all__ = [ 'Conv2D', 'Pool2D', 'FC', + 'BatchNorm', ] @@ -55,7 +56,8 @@ class Conv2D(layers.Layer): param_attr=param_attr, bias_attr=bias_attr, dtype=dtype, - name=name) + name=name, + act=act) self._groups = groups self._stride = utils.convert_to_list(stride, 2, 'stride') @@ -141,6 +143,7 @@ class Conv2D(layers.Layer): outputs={'Out': [pre_act]}, attrs={'axis': 1}) + # Currently, we don't support inplace in imperative mode return self._helper.append_activation(pre_act) @@ -216,6 +219,7 @@ class FC(layers.Layer): act=None, name=None): super(FC, self).__init__() + self._size = size self._num_flatten_dims = num_flatten_dims self._dtype = dtype @@ -241,6 +245,16 @@ class FC(layers.Layer): dtype=self._dtype, is_bias=False) + if self._helper.bias_attr: + size = list([self._size]) + self._b = self._helper.create_parameter( + attr=self._helper.bias_attr, + shape=size, + dtype=self._dtype, + is_bias=True) + 
else: + self._b = None + def forward(self, input): tmp = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( @@ -253,28 +267,155 @@ class FC(layers.Layer): "y_num_col_dims": 1 }) - out = self._helper.create_variable_for_type_inference(self._dtype) + pre_bias = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( type="sum", inputs={"X": [tmp]}, - outputs={"Out": out}, + outputs={"Out": pre_bias}, attrs={"use_mkldnn": False}) - bias_attr = self._helper.bias_attr - if bias_attr: - # add bias - size = list(out.shape[1:]) - if not self._built: - self._b = self._helper.create_parameter( - attr=bias_attr, shape=size, dtype=out.dtype, is_bias=True) - bias_out = self._helper.create_variable_for_type_inference( - dtype=out.dtype) + if self._b: + pre_activation = self._helper.create_variable_for_type_inference( + dtype=self._dtype) self._helper.append_op( type='elementwise_add', - inputs={'X': [out], + inputs={'X': [pre_bias], 'Y': [self._b]}, - outputs={'Out': [bias_out]}, - attrs={'axis': 1}) - out = bias_out - # add activation - return self._helper.append_activation(out) + outputs={'Out': [pre_activation]}, + attrs={'axis': self._num_flatten_dims}) + else: + pre_activation = pre_bias + # Currently, we don't support inplace in imperative mode + return self._helper.append_activation(pre_activation) + + +class BatchNorm(layers.Layer): + def __init__(self, + num_channels, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + dtype=core.VarDesc.VarType.FP32, + data_layout='NCHW', + in_place=False, + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=False, + fuse_with_relu=False, + use_global_stats=False): + super(BatchNorm, self).__init__() + + assert bias_attr is not False, "bias_attr should not be False in batch_norm." 
+ + from ..layer_helper import LayerHelper + self._helper = LayerHelper( + 'batch_norm', + param_attr=param_attr, + bias_attr=bias_attr, + name=name, + act=act) + + if dtype == core.VarDesc.VarType.FP16: + self._dtype = core.VarDesc.VarType.FP32 + else: + self._dtype = dtype + + param_shape = [num_channels] + + # create parameter + self._scale = self._helper.create_parameter( + attr=self._helper.param_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) + + # TODO(minqiyang): change stop_gradient sign to trainable to align with static graph + # # setting stop_gradient=True to reduce computation + # if use_global_stats and self._helper.param_attr.learning_rate == 0.: + # self._scale.stop_gradient = True + + self._bias = self._helper.create_parameter( + attr=self._helper.bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True) + # TODO(minqiyang): change stop_gradient sign to trainable to align with static graph + # # setting stop_gradient=True to reduce computation + # if use_global_stats and self._helper.bias_attr.learning_rate == 0.: + # self._bias.stop_gradient = True + + self._mean = self._helper.create_parameter( + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var), + shape=param_shape, + dtype=self._dtype) + self._mean.stop_gradient = True + + self._variance = self._helper.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var), + shape=param_shape, + dtype=self._dtype) + self._variance.stop_gradient = True + + self._in_place = in_place + self._momentum = momentum + self._epsilon = epsilon + self._is_test = is_test + self._fuse_with_relu = fuse_with_relu + self._use_global_stats = use_global_stats + + def _build_once(self, input): + pass + + def forward(self, input): + # create output + # mean and mean_out share the same memory + mean_out = self._mean + # variance and variance out share the same memory + variance_out = self._variance + + saved_mean = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + saved_variance = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference( + self._dtype) + + self._helper.append_op( + type="batch_norm", + inputs={ + "X": input, + "Scale": self._scale, + "Bias": self._bias, + "Mean": self._mean, + "Variance": self._variance + }, + outputs={ + "Y": batch_norm_out, + "MeanOut": mean_out, + "VarianceOut": variance_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance + }, + attrs={ + "momentum": self._momentum, + "epsilon": self._epsilon, + "is_test": self._is_test, + "use_mkldnn": False, + "fuse_with_relu": self._fuse_with_relu, + "use_global_stats": self._use_global_stats + }) + + # Currently, we don't support inplace in imperative mode + return self._helper.append_activation(batch_norm_out) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 8a2cd4a929..4f434328e4 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -24,7 +24,8 @@ __all__ = [ 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', 'MSRA', 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer', 'UniformInitializer', 'NormalInitializer', 'TruncatedNormalInitializer', - 
'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer' + 'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer', + 'NumpyArrayInitializer' ] _force_init_on_cpu_ = False @@ -683,6 +684,64 @@ class BilinearInitializer(Initializer): return op +class NumpyArrayInitializer(Initializer): + """Init an parameter with an numpy array + + Args: + value (numpy): numpy array to initialize the variable + + Examples: + .. code-block:: python + + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.NumpyArrayInitializer(numpy.array([1,2]))) + """ + + def __init__(self, value): + import numpy + assert isinstance(value, numpy.ndarray) + super(NumpyArrayInitializer, self).__init__() + self._value = value + + def __call__(self, var, block): + """Add constant initialization ops for a variable + + Args: + var: Variable that needs to be initialized + block: The block in which initialization ops + should be added + + Returns: + the initialization op + """ + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + # Initialization Ops should be prepended and not appended + dtype = framework.convert_np_dtype_to_dtype_(self._value.dtype) + if dtype == VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in self._value.flat] + elif dtype == VarDesc.VarType.INT32: + value_name = "int32_values" + values = [int(v) for v in self._value.flat] + else: + raise ValueError("Unsupported dtype %s", self._value.dtype) + if self._value.size > 1024 * 1024 * 5: + raise ValueError("The size of input is too big. Please consider " + "saving it to file and 'load_op' to load it") + op = block._prepend_op( + type='assign_value', + outputs={'Out': var}, + attrs={ + 'dtype': dtype, + 'shape': list(self._value.shape), + value_name: values + }, + stop_gradient=True) + var.op = op + return op + + # We short the class name, since users will use the initializer with the package # name. The sample code: # diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index e74a87fc68..6b1d4cc34f 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -19,6 +19,7 @@ import errno import time import shutil import six +from functools import reduce from paddle.fluid.executor import Executor from paddle.fluid.evaluator import Evaluator @@ -183,8 +184,6 @@ def save_vars(executor, # NOTE: don't save the variable which type is RAW if each_var.type == core.VarDesc.VarType.RAW: continue - if each_var.name == main_program._distributed_lookup_table: - continue new_var = _clone_var_in_block_(save_block, each_var) if filename is None: save_block.append_op( @@ -206,16 +205,6 @@ def save_vars(executor, outputs={}, attrs={'file_path': os.path.join(dirname, filename)}) - # if there is lookup table, the trainer 0 will notify all pserver to save. - if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table: - lookup_table_filename = os.path.join(dirname, "__lookup_table__") - attrs = {} - attrs['epmap'] = main_program._endpoints - attrs['dir'] = lookup_table_filename - attrs['lookup_table'] = main_program._distributed_lookup_table - save_block.append_op( - type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) - executor.run(save_program) @@ -267,6 +256,186 @@ def save_params(executor, dirname, main_program=None, filename=None): filename=filename) +def _save_distributed_persistables(executor, dirname, main_program): + """ + save_persistables for distributed training. 
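For intuition, NumpyArrayInitializer above flattens the array in C (row-major) order via ndarray.flat and only accepts fp32/int32 arrays of at most 5 * 1024 * 1024 elements. A NumPy-only sketch of that dispatch; the helper name is illustrative:

    import numpy as np

    def flatten_for_assign_value(value):
        assert isinstance(value, np.ndarray)
        if value.size > 1024 * 1024 * 5:
            raise ValueError("The size of input is too big. Please consider "
                             "saving it to file and 'load_op' to load it")
        if value.dtype == np.float32:
            return "fp32_values", [float(v) for v in value.flat]
        if value.dtype == np.int32:
            return "int32_values", [int(v) for v in value.flat]
        raise ValueError("Unsupported dtype %s" % value.dtype)

    # row-major flattening: a (2, 3) arange becomes [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
    name, values = flatten_for_assign_value(
        np.arange(6, dtype=np.float32).reshape(2, 3))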
+ The method will do the things listed below: + 1. save part of the persistable variables on the trainer. + 2. receive "remote prefetch variables" from parameter servers and merge them. + 3. save the "distributed lookup table" on parameter servers. + 4. receive "optimizer variables" from parameter servers and merge them. + + Args: + executor(Executor): The executor to run for saving parameters. + dirname(str): The saving directory path. + main_program(Program): The program whose parameters will be + saved. The main_program must be the trainer program + obtained from the transpiler. + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + t = distribute_transpiler.DistributeTranspiler() + t.transpile(...) + train_program = t.get_trainer_program() + _save_distributed_persistables(executor=exe, dirname=param_path, main_program=train_program) + """ + + def __save_remote_params(executor, dirname, remote_params_map): + """ + Receive params from pservers through RPC. + If a param has been sliced, concat the slices into one variable, then save it. + """ + if not remote_params_map: + return + + prog = Program() + block = prog.global_block() + + # recv optimize vars from pserver + for name, remote_params in remote_params_map.items(): + origin_var = None + is_slice = False + slice_vars = [0] * len(remote_params) + slice_var_names = [""] * len(remote_params) + endpoints = [""] * len(remote_params) + + for idx, optimizer in enumerate(remote_params): + origin = optimizer.origin + slice = optimizer.slice + is_slice = optimizer.is_slice + block_id = optimizer.block_id + endpoint = optimizer.endpoint + + if idx == 0: + origin_var = block.create_var( + name=origin.name, + type=origin.type, + shape=origin.shape, + dtype=origin.dtype, + persistable=True) + + slice_var = block.create_var( + name="{}.slice.{}".format(slice.name, idx), + type=slice.type, + shape=slice.shape, + dtype=slice.dtype, + persistable=True) + + index = block_id if is_slice else idx + slice_vars[index] = slice_var + slice_var_names[index] = slice.name + endpoints[index] = endpoint + + if is_slice: + block.append_op( + type='recv', + inputs={"X": []}, + outputs={"Out": slice_vars}, + attrs={ + "epmap": endpoints, + "with_barrier": False, + "varnames": slice_var_names, + "sync_mode": True + }) + block.append_op( + type='concat', + inputs={'X': slice_vars}, + outputs={'Out': origin_var}, + attrs={}) + else: + block.append_op( + type='recv', + inputs={"X": []}, + outputs={"Out": [origin_var]}, + attrs={ + "epmap": endpoints[:1], + "with_barrier": False, + "varnames": slice_var_names, + "sync_mode": True + }) + block.append_op( + type='save', + inputs={'X': [origin_var]}, + outputs={}, + attrs={'file_path': os.path.join(dirname, origin_var.name)}) + block.append_op(type='delete_var', inputs={'X': slice_vars}) + executor.run(prog) + + def __save_distributed_lookup_tables(executor, dirname, + distributed_lookup_table, endpoints): + """ + Because the distributed lookup table may be too huge to merge and save in one place, + each parameter server saves its own part independently. + + The save directory is dirname/"__lookup_table__". + + """ + prog = Program() + block = prog.global_block() + + # if there is a lookup table, trainer 0 will notify all pservers to save it.
+ lookup_table_filename = os.path.join(dirname, "__lookup_table__") + attrs = {} + attrs['epmap'] = endpoints + attrs['dir'] = lookup_table_filename + attrs['lookup_table'] = distributed_lookup_table + block.append_op( + type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) + executor.run(prog) + + def __exclude_vars(exclude_var_names=[]): + def is_valid(var): + if var.name in exclude_var_names: + return False + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.READER: + return False + return var.persistable + + return is_valid + + if not isinstance(main_program, Program): + raise ValueError("'main_program' should be an instance of Program.") + + if not main_program._is_distributed: + raise ValueError( + "'_save_distributed_persistables' just be designed for distributed training." + ) + + remote_params_map = main_program._parameters_on_pservers.get_distributed_vars_by_vtypes( + ["Optimizer", "RemotePrefetch"], groupby=True) + + exclude_var_names = [] + if remote_params_map: + exclude_var_names.extend(remote_params_map.keys()) + + if main_program._distributed_lookup_table: + if isinstance(main_program._distributed_lookup_table, list): + exclude_var_names.extend(main_program._distributed_lookup_table) + else: + exclude_var_names.append(main_program._distributed_lookup_table) + + local_vars = list( + filter(__exclude_vars(exclude_var_names), main_program.list_vars())) + save_vars( + executor, main_program=main_program, dirname=dirname, vars=local_vars) + + if main_program._is_chief: + if remote_params_map: + __save_remote_params(executor, dirname, remote_params_map) + if main_program._distributed_lookup_table: + __save_distributed_lookup_tables( + executor, dirname, main_program._distributed_lookup_table, + main_program._endpoints) + + def save_persistables(executor, dirname, main_program=None, filename=None): """ This function filters out all variables with `persistable==True` from the @@ -301,13 +470,19 @@ def save_persistables(executor, dirname, main_program=None, filename=None): fluid.io.save_persistables(executor=exe, dirname=param_path, main_program=None) """ - save_vars( - executor, - dirname=dirname, - main_program=main_program, - vars=None, - predicate=is_persistable, - filename=filename) + + if main_program and main_program._is_distributed: + _save_distributed_persistables( + executor, dirname=dirname, main_program=main_program) + + else: + save_vars( + executor, + dirname=dirname, + main_program=main_program, + vars=None, + predicate=is_persistable, + filename=filename) def load_vars(executor, @@ -402,17 +577,11 @@ def load_vars(executor, if not isinstance(main_program, Program): raise TypeError("program should be as Program type or None") - load_slice_vars = [] - for each_var in main_program._slice_vars_and_attrs: - load_slice_vars.append(each_var[2].name) - load_var_map = {} for each_var in vars: assert isinstance(each_var, Variable) if each_var.type == core.VarDesc.VarType.RAW: continue - if each_var.name in load_slice_vars: - continue new_var = _clone_var_in_block_(load_block, each_var) if filename is None: load_block.append_op( @@ -435,10 +604,6 @@ def load_vars(executor, attrs={'file_path': os.path.join(dirname, filename)}) executor.run(load_prog) - # load slice vars on pserver, if have it. 
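The __exclude_vars helper above is a closure returning a predicate for filter(); a self-contained sketch of the same pattern, with toy variables standing in for framework ones:

    class Var(object):
        def __init__(self, name, persistable):
            self.name, self.persistable = name, persistable

    def exclude_vars(exclude_names):
        def is_valid(var):
            # keep persistable variables that are not explicitly excluded
            return var.persistable and var.name not in exclude_names
        return is_valid

    all_vars = [Var("fc_0.w_0", True), Var("feed", False), Var("emb.block0", True)]
    kept = list(filter(exclude_vars(["emb.block0"]), all_vars))
    # kept contains only fc_0.w_0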
- _load_slice_up_vars(executor, dirname, - main_program._slice_vars_and_attrs) - def load_params(executor, dirname, main_program=None, filename=None): """ @@ -521,12 +686,134 @@ def load_persistables(executor, dirname, main_program=None, filename=None): fluid.io.load_persistables(executor=exe, dirname=param_path, main_program=None) """ - load_vars( - executor, - dirname=dirname, - main_program=main_program, - predicate=is_persistable, - filename=filename) + + if main_program and main_program._is_distributed: + _load_distributed_persistables( + executor, dirname=dirname, main_program=main_program) + else: + load_vars( + executor, + dirname=dirname, + main_program=main_program, + predicate=is_persistable, + filename=filename) + + +def _load_distributed_persistables(executor, dirname, main_program=None): + """ + customized load_persistables for distributed training. + it should be used on parameter server, + + Args: + executor(Executor): The executor to run for saving parameters. + dirname(str): The load directory path. + main_program(Program): The program whose parameters will be + loaded. the main_program must be the pserver_program + get after transpiler. + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + t = distribute_transpiler.DistributeTranspiler() + t.transpile(...) + pserver_prog = t.get_pserver_program(...) + _load_distributed_persistables(executor=exe, dirname=param_path, main_program=pserver_prog) + """ + + def __is_distributed_part_var(varname): + trainer_idx = varname.find(".trainer_") + block_idx = varname.find(".block") + return trainer_idx or block_idx + + def __load_persistable_vars(executor, dirname, need_load_vars): + load_prog = Program() + load_block = load_prog.global_block() + need_delete_vars = [] + + for param in need_load_vars: + origin_var = param.origin + slice_var = param.slice + is_slice = param.is_slice + offset = param.offset + + if is_slice: + origin = load_block.create_var( + name="{}.load".format(origin_var.name), + type=origin_var.type, + shape=origin_var.shape, + dtype=origin_var.dtype, + persistable=True) + + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [origin]}, + attrs={ + 'file_path': os.path.join(dirname, origin_var.name) + }) + + slice = load_block.create_var( + name=slice_var.name, + type=slice_var.type, + shape=slice_var.shape, + dtype=slice_var.dtype, + persistable=True) + + dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:]) + start = int(offset / dim1_flatten) + end = int(offset / dim1_flatten + slice.shape[0]) + + load_block.append_op( + type="slice", + inputs={'Input': origin}, + outputs={'Out': slice}, + attrs={'axes': [0], + 'starts': [start], + 'ends': [end]}) + + need_delete_vars.append(origin) + else: + origin = load_block.create_var( + name="{}".format(origin_var.name), + type=origin_var.type, + shape=origin_var.shape, + dtype=origin_var.dtype, + persistable=True) + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [origin]}, + attrs={ + 'file_path': os.path.join(dirname, origin_var.name) + }) + + load_block.append_op( + type='delete_var', + inputs={'X': need_delete_vars}, ) + + executor.run(load_prog) + + if not isinstance(main_program, Program): + raise ValueError("'main_program' should be an instance of Program.") + + if not main_program._is_distributed: + raise ValueError( + "'_load_distributed_persistables' just be designed for distributed training." 
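A NumPy check of the offset arithmetic used in __load_persistable_vars above: a slice that begins `offset` elements into the flattened origin occupies rows [int(offset / dim1_flatten), int(offset / dim1_flatten) + slice_rows). The toy shapes below are illustrative:

    import numpy as np
    from functools import reduce

    origin = np.arange(24).reshape(6, 4)   # full variable, as loaded from disk
    offset, slice_rows = 8, 2              # this shard holds elements 8..15
    dim1_flatten = reduce(lambda x, y: x * y, origin.shape[1:])
    start = int(offset / dim1_flatten)                 # row 2
    end = int(offset / dim1_flatten) + slice_rows      # row 4
    expected = origin.reshape(-1)[offset:offset + slice_rows * dim1_flatten]
    assert (origin[start:end].reshape(-1) == expected).all()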
+ ) + + if not main_program._ps_endpoint: + raise ValueError( + "'_load_distributed_persistables' need current_endpoint set in DistributeTranspiler.transpile" + ) + + need_load_vars = main_program._parameters_on_pservers.get_distributed_vars_by_ep( + main_program._ps_endpoint) + __load_persistable_vars(executor, dirname, need_load_vars) def prepend_feed_ops(inference_program, @@ -795,52 +1082,6 @@ def load_inference_model(dirname, return [program, feed_target_names, fetch_targets] -def _save_lookup_tables_by_notify(executor, dirname, lookup_table, - pserver_endpoints): - """ - This function will send checkpoint notify message from Trainer 0 - to all the pservers. - The checkpoint notify message contains lookup table name, - the absolute path on pserver to save lookup_table. - - Args: - executor(Executor): The executor to run for send checkpoint notify. - dirname(str): The folder where to save. - lookup_table(string): the lookup table name, when use distribute - lookup table, we can get lookup table name by DistributeTranspiler. - table_name - ps_endpoint_list(list): the parameter server ip:port list. - when use distribute lookup table, we can get ps_endpoint_list by - distribute arguments. - Return: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - table_name = "share_w" - ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] - - _save_pserver_vars_by_notify(executor=exe, - dirname=param_path, lookup_table=table_name, - pserver_endpoints=ps_endpoints) - """ - - pserver_notify_program = Program() - pserver_notify_block = pserver_notify_program.global_block() - - attrs = {} - attrs['epmap'] = pserver_endpoints - attrs['dir'] = dirname - attrs['lookup_table'] = lookup_table - - pserver_notify_block.append_op( - type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) - executor.run(pserver_notify_program) - - def _endpoints_replacement(program, endpoints): ENDPOINT_MAP = "epmap" for op in program.global_block().ops: @@ -911,54 +1152,3 @@ def get_parameter_value_by_name(name, executor, program=None): program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) - - -def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): - if not slice_vars_and_attrs: - return - - load_prog = Program() - load_block = load_prog.global_block() - need_delete_vars = [] - - for var_tuple in slice_vars_and_attrs: - orig_var = var_tuple[0] - start = var_tuple[1] - slice_var = var_tuple[2] - end = start + slice_var.shape[0] - - orig_var_name = orig_var.name - orig_var.name = "{}.origin".format(orig_var_name) - - clone_orig_var = load_block.create_var( - name=orig_var.name, - type=orig_var.type, - shape=orig_var.shape, - dtype=orig_var.dtype, - persistable=True) - - clone_slice_var = load_block.create_var( - name=slice_var.name, - type=slice_var.type, - shape=slice_var.shape, - dtype=slice_var.dtype, - persistable=True) - - load_block.append_op( - type='load', - inputs={}, - outputs={'Out': [clone_orig_var]}, - attrs={'file_path': os.path.join(dirname, orig_var_name)}) - load_block.append_op( - type="slice", - inputs={'Input': clone_orig_var}, - outputs={'Out': clone_slice_var}, - attrs={'axes': [0], - 'starts': [start], - 'ends': [end]}) - need_delete_vars.append(clone_orig_var) - - load_block.append_op( - type='delete_var', - inputs={'X': need_delete_vars}, ) - executor.run(load_prog) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 
ea9953f581..972c51938f 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -435,7 +435,10 @@ class LayerHelper(object): act_type = act.pop('type') tmp = input_var # NOTE(dzhwinter): some activation support inplace compution. - if not core.IsInplace(act_type): + # NOTE(minqiyang): currently, we don't support inplace in imperative mode + if not imperative_base.enabled() and core.IsInplace(act_type): + tmp = input_var + else: + tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) self.append_op( type=act_type, diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 8aed97dc59..cddc302d52 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -44,6 +44,7 @@ __all__ = [ 'roi_perspective_transform', 'generate_proposal_labels', 'generate_proposals', + 'generate_mask_labels', 'iou_similarity', 'box_coder', 'polygon_box_transform', @@ -1659,7 +1660,7 @@ def generate_proposal_labels(rpn_rois, class_nums=None, use_random=True): """ - ** Generate proposal labels Faster-RCNN ** + ** Generate Proposal Labels of Faster-RCNN ** This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth, to sample foreground boxes and background boxes, and compute loss target. @@ -1740,6 +1741,140 @@ def generate_proposal_labels(rpn_rois, return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights +def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, + labels_int32, num_classes, resolution): + """ + ** Generate Mask Labels for Mask-RCNN ** + + Given the RoIs and their corresponding labels, this operator samples + foreground RoIs. The mask branch also has a :math:`K \\times M^{2}` + dimensional output target for each foreground RoI, which encodes K binary + masks of resolution M x M, one for each of the K classes. These mask + targets are used to compute the loss of the mask branch. + + Please note the data format of the ground-truth segmentations; assume the + segmentations are as follows. The first instance has two gt objects. + The second instance has one gt object, and this object has two gt segmentations. + + .. code-block:: python + + #[ + # [[[229.14, 370.9, 229.14, 370.9, ...]], + # [[343.7, 139.85, 349.01, 138.46, ...]]], # 0-th instance + # [[[500.0, 390.62, ...],[115.48, 187.86, ...]]] # 1-th instance + #] + + batch_masks = [] + for segms in batch_segms: + gt_masks = [] + for segm in segms: + gt_segm = [] + for polys in segm: + gt_segm.append(np.array(polys).reshape(-1, 2)) + gt_masks.append(gt_segm) + batch_masks.append(gt_masks) + + + place = fluid.CPUPlace() + feeder = fluid.DataFeeder(place=place, feed_list=feeds) + feeder.feed(batch_masks) + + Args: + im_info(Variable): A 2-D Tensor with shape [N, 3]. N is the batch size, + each element is [height, width, scale] of an image. Image scale is + target_size / original_size. + gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the total + number of ground-truths; each element is a class label. + is_crowd(Variable): A 2-D LoDTensor with the same shape as gt_classes; + each element is a flag indicating whether a groundtruth is crowd. + gt_segms(Variable): This input is a 2-D LoDTensor with shape [S, 2]; + its LoD level is 3. Usually users do not need to understand the LoD; + they only need to return the correct data format in the reader. + + + + The LoD[0] represents the number of gt objects in + each instance. LoD[1] represents the segmentation count of each + object. LoD[2] represents the number of polygons in each segmentation. + S is the total number of polygon coordinate points. Each element is + an (x, y) coordinate point. + rois(Variable): A 2-D LoDTensor with shape [R, 4]. R is the total + number of RoIs; each element is a bounding box in + (xmin, ymin, xmax, ymax) format in the range of the original image. + labels_int32(Variable): A 2-D LoDTensor in shape of [R, 1] with type + of int32. R is the same as in `rois`. Each element represents + a class label of a RoI. + num_classes(int): Class number. + resolution(int): Resolution of mask predictions. + + Returns: + mask_rois (Variable): A 2-D LoDTensor with shape [P, 4]. P is the total + number of sampled RoIs. Each element is a bounding box in + [xmin, ymin, xmax, ymax] format in the range of the original image size. + mask_rois_has_mask_int32 (Variable): A 2-D LoDTensor with shape [P, 1]; + each element represents the output mask RoI index with regard to + the input RoIs. + mask_int32 (Variable): A 2-D LoDTensor with shape [P, K * M * M]; + K is the number of classes and M is the resolution of the mask predictions. + Each element represents the binary mask targets. + + Examples: + .. code-block:: python + + im_info = fluid.layers.data(name="im_info", shape=[3], + dtype="float32") + gt_classes = fluid.layers.data(name="gt_classes", shape=[1], + dtype="float32", lod_level=1) + is_crowd = fluid.layers.data(name="is_crowd", shape=[1], + dtype="float32", lod_level=1) + gt_masks = fluid.layers.data(name="gt_masks", shape=[2], + dtype="float32", lod_level=3) + # rois, labels_int32 can be the output of + # fluid.layers.generate_proposal_labels. + mask_rois, mask_index, mask_int32 = fluid.layers.generate_mask_labels( + im_info=im_info, + gt_classes=gt_classes, + is_crowd=is_crowd, + gt_segms=gt_masks, + rois=rois, + labels_int32=labels_int32, + num_classes=81, + resolution=14) + """ + + helper = LayerHelper('generate_mask_labels', **locals()) + + mask_rois = helper.create_variable_for_type_inference(dtype=rois.dtype) + roi_has_mask_int32 = helper.create_variable_for_type_inference( + dtype=gt_classes.dtype) + mask_int32 = helper.create_variable_for_type_inference( + dtype=gt_classes.dtype) + + helper.append_op( + type="generate_mask_labels", + inputs={ + 'ImInfo': im_info, + 'GtClasses': gt_classes, + 'IsCrowd': is_crowd, + 'GtSegms': gt_segms, + 'Rois': rois, + 'LabelsInt32': labels_int32 + }, + outputs={ + 'MaskRois': mask_rois, + 'RoiHasMaskInt32': roi_has_mask_int32, + 'MaskInt32': mask_int32 + }, + attrs={'num_classes': num_classes, + 'resolution': resolution}) + + mask_rois.stop_gradient = True + roi_has_mask_int32.stop_gradient = True + mask_int32.stop_gradient = True + + return mask_rois, roi_has_mask_int32, mask_int32 + + def generate_proposals(scores, bbox_deltas, im_info, @@ -1754,33 +1889,48 @@ def generate_proposals(scores, """ **Generate proposal Faster-RCNN** - This operation proposes RoIs according to each box with their probability to be a foreground object and - the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals + This operation proposes RoIs according to each box with their + probability to be a foreground object, and + the box can be calculated by anchors. The bbox_deltas and scores + to be an object are the output of the RPN. Final proposals could be used to train detection net. For generating proposals, this operation performs following steps: + - 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) + 1. Transposes and resizes scores and bbox_deltas in size of + (H*W*A, 1) and (H*W*A, 4) + 2. Calculate box locations as proposal candidates. + 3. Clip boxes to the image. + 4. Remove predicted boxes with small area. + 5. Apply NMS to get final proposals as output. + + Args: + - scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. - N is batch size, A is number of anchors, H and W are height and width of the feature map. - bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location. - im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale + scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents + the probability for each box to be an object. + N is batch size, A is number of anchors, H and W are height and + width of the feature map. + bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] + represents the difference between the predicted box location and + the anchor location. + im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin + image information for N batch. Info contains height, width and scale between origin image size and the size of feature map. - anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, - num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. - variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. - pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default. - post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default. + anchors(Variable): A 4-D Tensor represents the anchors with a layout + of [H, W, A, 4]. H and W are height and width of the feature map, + num_anchors is the box count of each position. Each anchor is + in (xmin, ymin, xmax, ymax) format and unnormalized. + variances(Variable): The expanded variances of anchors with a layout of + [H, W, num_priors, 4]. Each variance is in + (xcenter, ycenter, w, h) format. + pre_nms_top_n(float): Number of total bboxes to be kept per + image before NMS. 6000 by default. + post_nms_top_n(float): Number of total bboxes to be kept per + image after NMS. 1000 by default. nms_thresh(float): Threshold in NMS, 0.5 by default. - min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. - eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. - + min_size(float): Remove predicted boxes with either height or + width < min_size. 0.1 by default. + eta(float): Apply in adaptive NMS; if adaptive threshold > 0.5, + adaptive_threshold = adaptive_threshold * eta in each iteration.
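Step 2 above relies on the standard Faster-RCNN box decoding from anchors and deltas; a NumPy sketch of decoding one box (the operator's exact variant, e.g. variance handling, may differ):

    import numpy as np

    def decode_box(anchor, delta):
        # anchor: (xmin, ymin, xmax, ymax); delta: (dx, dy, dw, dh)
        w, h = anchor[2] - anchor[0] + 1.0, anchor[3] - anchor[1] + 1.0
        cx, cy = anchor[0] + 0.5 * w, anchor[1] + 0.5 * h
        pcx, pcy = delta[0] * w + cx, delta[1] * h + cy
        pw, ph = np.exp(delta[2]) * w, np.exp(delta[3]) * h
        return np.array([pcx - 0.5 * pw, pcy - 0.5 * ph,
                         pcx + 0.5 * pw - 1.0, pcy + 0.5 * ph - 1.0])

    # a zero delta reproduces the anchor itself: [0., 0., 15., 15.]
    print(decode_box(np.array([0., 0., 15., 15.]), np.zeros(4)))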
""" helper = LayerHelper('generate_proposals', **locals()) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 9a29b25093..1762bd3e34 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -523,7 +523,7 @@ def _py_reader(capacity, double_buffer_name = "_".join([name, "double_buffer"]) var = global_scope().var(queue_name) - feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity) startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=reader_name) diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index dde0518972..617704a531 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -321,7 +321,7 @@ def append_LARS(params_grads, learning_rate, weight_decay): The decayed learning rate Examples: .. code-block:: python - + learning_rate *= local_gw_ratio * sqrt(sumsq(param)) / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) """ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 503c91c27b..beb5e31211 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -22,7 +22,7 @@ import six import os import inspect from ..layer_helper import LayerHelper -from ..initializer import Normal, Constant +from ..initializer import Normal, Constant, NumpyArrayInitializer from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ @@ -179,6 +179,7 @@ __all__ = [ 'merge_selected_rows', 'get_tensor_from_selected_rows', 'lstm', + 'shuffle_channel', 'py_func', 'psroi_pool', 'teacher_student_sigmoid_loss', @@ -2874,7 +2875,7 @@ def batch_norm(input, attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) # setting stop_gradient=True to reduce computation if use_global_stats and helper.bias_attr.learning_rate == 0.: - scale.stop_gradient = True + bias.stop_gradient = True mean = helper.create_parameter( attr=ParamAttr( @@ -3875,6 +3876,7 @@ def beam_search(pre_ids, beam_size, end_id, level=0, + is_accumulated=True, name=None): """ Beam search is a classical algorithm for selecting candidate words in a @@ -3887,14 +3889,17 @@ def beam_search(pre_ids, selects the top-K candidate word ids of current step from :attr:`ids` according to their :attr:`scores` for all source sentences, where K is :attr:`beam_size` and :attr:`ids, scores` are predicted results from the - computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are - the output of beam_search at previous step, they are needed for special use - to handle ended candidate translations. - - Note that the :attr:`scores` passed in should be accumulated scores, and - length penalty should be done with extra operators before calculating the - accumulated scores if needed, also suggest finding top-K before it and - using the top-K candidates following. + computation cell. If :attr:`ids` is not set, it will be calculated out + according to :attr:`scores`. Additionally, :attr:`pre_ids` and + :attr:`pre_scores` are the output of beam_search at previous step, they + are needed for special use to handle ended candidate translations. 
+ + Note that if :attr:`is_accumulated` is :attr:`True`, the :attr:`scores` + passed in should be accumulated scores. Else, the :attr:`scores` are + considered as the straightforward scores and will be transformed to the + log field and accumulated the :attr:`pre_scores` in this operator. + Length penalty should be done with extra operators before calculating the + accumulated scores if needed. Please see the following demo for a fully beam search usage example: @@ -3924,6 +3929,8 @@ def beam_search(pre_ids, describes how these candidates belong to the prefix. The paths linking prefixes and selected candidates are organized and reserved in lod. + is_accumulated(bool, default True): Whether the input :attr:`score` is + accumulated scores. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -3952,8 +3959,12 @@ def beam_search(pre_ids, end_id=end_id) """ helper = LayerHelper('beam_search', **locals()) - score_type = scores.dtype - id_type = ids.dtype + score_type = pre_scores.dtype + id_type = pre_ids.dtype + + inputs = {"pre_ids": pre_ids, "pre_scores": pre_scores, "scores": scores} + if ids is not None: + inputs["ids"] = ids selected_scores = helper.create_variable_for_type_inference( dtype=score_type) @@ -3961,12 +3972,7 @@ def beam_search(pre_ids, helper.append_op( type='beam_search', - inputs={ - 'pre_ids': pre_ids, - 'pre_scores': pre_scores, - 'ids': ids, - 'scores': scores, - }, + inputs=inputs, outputs={ 'selected_ids': selected_ids, 'selected_scores': selected_scores, @@ -3976,6 +3982,7 @@ def beam_search(pre_ids, 'level': level, 'beam_size': beam_size, 'end_id': end_id, + 'is_accumulated': is_accumulated, }) return selected_ids, selected_scores @@ -5146,9 +5153,9 @@ def nce(input, littles = [] for i in range(custom_dist_len): normal_prob = custom_dist[i] * custom_dist_len - if normal_prob - 1.0 > 1e-4: + if normal_prob - 1.0 > 0: bigs.append((i, normal_prob)) - elif 1.0 - normal_prob > 1e-4: + elif 1.0 - normal_prob > 0: littles.append((i, normal_prob)) else: alias_probs_[i] = normal_prob @@ -5164,9 +5171,9 @@ def nce(input, alias_probs_[little[0]] = little[1] alias_[little[0]] = big_idx big_left = big[1] + little[1] - 1 - if big_left - 1.0 > 1e-4: + if big_left - 1.0 > 0: bigs.append((big_idx, big_left)) - elif 1.0 - big_left > 1e-4: + elif 1.0 - big_left > 0: littles.append((big_idx, big_left)) else: alias_probs_[big_idx] = big_left @@ -5181,14 +5188,21 @@ def nce(input, alias_probs_[little[0]] = 1.0 alias_[little[0]] = -1 - probs = assign(input=np.array(custom_dist).astype('float32')) - custom_alias = assign(input=np.array(alias_).astype('int32')) - custom_alias_probs = assign( - input=np.array(alias_probs_).astype('float32')) - - inputs['CustomDistProbs'] = probs - inputs['CustomDistAlias'] = custom_alias - inputs['CustomDistAliasProbs'] = custom_alias_probs + def _init_by_numpy_array(numpy_array): + ret = helper.create_parameter( + attr=ParamAttr(), + shape=numpy_array.shape, + dtype=numpy_array.dtype, + default_initializer=NumpyArrayInitializer(numpy_array)) + ret.stop_gradient = True + return ret + + inputs['CustomDistProbs'] = _init_by_numpy_array( + np.array(custom_dist).astype('float32')) + inputs['CustomDistAlias'] = _init_by_numpy_array( + np.array(alias_).astype('int32')) + inputs['CustomDistAliasProbs'] = _init_by_numpy_array( + np.array(alias_probs_).astype('float32')) sampler = 2 else: raise Exception("Unsupported sampler type.") @@ -5849,7 +5863,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, 
step=1): type='increment', inputs={'X': [counter]}, outputs={'Out': [counter]}, - attrs={'step': float(step)}) + attrs={'step': float(step)}, + stop_gradient=True) counter.stop_gradient = True return counter @@ -8927,7 +8942,8 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): def sigmoid_cross_entropy_with_logits(x, label, ignore_index=kIgnoreIndex, - name=None): + name=None, + normalize=False): """ ${comment} @@ -8936,9 +8952,25 @@ def sigmoid_cross_entropy_with_logits(x, label(${label_type}): ${label_comment} ignore_index(&{ignore_index}): ${ignore_index_comment} name(basestring|None): Name of the output. + normalize(bool): If true, divide the output by the number of + targets != ignore_index. Returns: out(${out_type}): ${out_comment} + + Examples: + .. code-block:: python + + input = fluid.layers.data( + name='data', shape=[10], dtype='float32') + label = fluid.layers.data( + name='label', shape=[10], dtype='float32') + loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=input, + label=label, + ignore_index=-1, + normalize=True) # or False + # loss = fluid.layers.reduce_sum(loss) # summation of loss """ helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals()) @@ -8953,7 +8985,8 @@ def sigmoid_cross_entropy_with_logits(x, type="sigmoid_cross_entropy_with_logits", inputs={"X": x, "Label": label}, - attrs={"ignore_index": ignore_index}, + attrs={"ignore_index": ignore_index, + 'normalize': normalize}, outputs={"Out": out}) return out @@ -9450,7 +9483,7 @@ def teacher_student_sigmoid_loss(input, by the previous operator. label (Variable|list): the ground truth which is a 2-D tensor with shape [N x 1], where N is the batch size. - soft_max_up_bound (float): if input > soft_max_up_bound, will be bound + soft_max_up_bound (float): if input > soft_max_up_bound, will be bound soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound Returns: @@ -9614,6 +9647,79 @@ def get_tensor_from_selected_rows(x, name=None): return out +def shuffle_channel(x, group, name=None): + """ + **Shuffle Channel Operator** + + This operator shuffles the channels of input x. + It divides the input channels into :attr:`group` subgroups, + and obtains a new order by selecting elements from every subgroup one by one. + + Please refer to the paper + https://arxiv.org/pdf/1707.01083.pdf + + .. code-block:: text + + Given a 4-D tensor input with the shape (N, C, H, W): + input.shape = (1, 4, 2, 2) + input.data = [[[[0.1, 0.2], + [0.2, 0.3]], + + [[0.3, 0.4], + [0.4, 0.5]], + + [[0.5, 0.6], + [0.6, 0.7]], + + [[0.7, 0.8], + [0.8, 0.9]]]] + Given group: 2 + then we get a 4-D tensor out with the same shape as the input: + out.shape = (1, 4, 2, 2) + out.data = [[[[0.1, 0.2], + [0.2, 0.3]], + + [[0.5, 0.6], + [0.6, 0.7]], + + [[0.3, 0.4], + [0.4, 0.5]], + + [[0.7, 0.8], + [0.8, 0.9]]]] + + Args: + x(Variable): The input tensor variable. It should be a 4-D tensor with shape [N, C, H, W] + group(int): Indicating the count of subgroups; it must divide the number of channels. + + Returns: + out(Variable): the channel shuffling result is a tensor variable with the + same shape and same type as the input. + + Raises: + TypeError: If group is not an int type variable. + + Examples: + ..
code-block:: python + + input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32') + out = fluid.layers.shuffle_channel(x=input, group=2) + """ + helper = LayerHelper("shuffle_channel", **locals()) + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + if not isinstance(group, int): + raise TypeError("group must be int type") + + helper.append_op( + type="shuffle_channel", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"group": group}) + return out + + class PyFuncRegistry(object): _register_funcs = [] diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index ce9f508c9f..2153ca254f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -382,7 +382,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): 'dtype': out.dtype, 'value': float(value), 'force_cpu': force_cpu or force_init_on_cpu() - }) + }, + stop_gradient=True) out.stop_gradient = True return out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index b72b900d3b..14f4276e2f 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -301,10 +301,10 @@ class Optimizer(object): no_grad_set (set|None): set of Variables should be ignored. callbacks (list|None): list of callables to run when appending backward operator for one parameter. - + Return: list: list of (param, grad) pair, grad is the output of backward. - + Examples: See examples in `apply_gradients`. """ @@ -322,10 +322,10 @@ class Optimizer(object): Args: params_grads (list): list of (param, grad) pair to do optimization. - + Returns: list: A list of operators appended to the current program. - + Examples: .. code-block:: python @@ -364,7 +364,7 @@ class Optimizer(object): This method combines interface `backward()` and `apply_gradients()` into one. - + Args: loss (Variable): loss variable to run optimizations. startup_program (Program): startup_program for initializing parameters @@ -381,18 +381,21 @@ class Optimizer(object): optimize_ops = [] if imperative_base.enabled(): if parameter_list is not None: - params_grads = parameter_list + parameters = parameter_list else: parameters = program.global_block().all_parameters() - params_grads = [] - for param in parameters: - # create gradient variable - grad_var = Variable( - block=loss.block, - name=param._ivar._grad_name(), - stop_gradient=True, - ivar=param._ivar._grad_ivar()) - params_grads.append((param, grad_var)) + + params_grads = [] + for param in parameters: + if param.stop_gradient: + continue + # create gradient variable + grad_var = Variable( + block=loss.block, + name=param._ivar._grad_name(), + stop_gradient=True, + ivar=param._ivar._grad_ivar()) + params_grads.append((param, grad_var)) with program_guard(program, startup_program): optimize_ops = self._create_optimization_pass(params_grads) else: diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index a1b1d2f584..a07ff6ac69 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -159,7 +159,7 @@ class ParallelExecutor(object): trainers_endpoints = main._trainers_endpoints if num_trainers > 1 and trainers_endpoints: assert num_trainers == len( - trainers_endpoints), "num_trainers == len(end_points)" + trainers_endpoints), "num_trainers == len(endpoints)" build_strategy.trainers_endpoints = trainers_endpoints # step6: get persistable_vars, places. 
persistable_vars diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index e24a9aa989..487a29c839 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -195,9 +195,32 @@ def infer(use_cuda, save_dirname=None): }, fetch_list=fetch_targets, return_numpy=False) - print(results[0].recursive_sequence_lengths()) + + def to_infer_tensor(lod_tensor): + infer_tensor = fluid.core.PaddleTensor() + infer_tensor.lod = lod_tensor.lod() + infer_tensor.data = fluid.core.PaddleBuf(np.array(lod_tensor)) + infer_tensor.shape = lod_tensor.shape() + infer_tensor.dtype = fluid.core.PaddleDType.INT64 + return infer_tensor + + infer_inputs = [first_word, second_word, third_word, fourth_word] + infer_inputs = [to_infer_tensor(t) for t in infer_inputs] + + infer_config = fluid.core.NativeConfig() + infer_config.model_dir = 'word2vec.inference.model' + infer_config.use_gpu = use_cuda + if use_cuda: + infer_config.device = 0 + infer_config.fraction_of_gpu_memory = 0.15 + compiled_program = fluid.compiler.CompiledProgram(inference_program) + compiled_program.with_inference_optimize(infer_config) + assert compiled_program._is_inference is True + infer_outputs = exe.run(compiled_program, feed=infer_inputs) np_data = np.array(results[0]) - print("Inference Shape: ", np_data.shape) + infer_out = infer_outputs[0].data.float_data() + for a, b in zip(np_data[0], infer_out): + assert np.isclose(a, b), "a: {}, b: {}".format(a, b) def main(use_cuda, is_sparse, is_parallel): diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index d99eaa0634..2d9ed9f9c6 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -203,7 +203,7 @@ class TestGenerateProposalLabels(unittest.TestCase): lod_level=1, append_batch_size=False) class_nums = 5 - rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( + outs = fluid.layers.generate_proposal_labels( rpn_rois=rpn_rois, gt_classes=gt_classes, is_crowd=is_crowd, @@ -216,6 +216,11 @@ class TestGenerateProposalLabels(unittest.TestCase): bg_thresh_lo=0.0, bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], class_nums=class_nums) + rois = outs[0] + labels_int32 = outs[1] + bbox_targets = outs[2] + bbox_inside_weights = outs[3] + bbox_outside_weights = outs[4] assert rois.shape[1] == 4 assert rois.shape[0] == labels_int32.shape[0] assert rois.shape[0] == bbox_targets.shape[0] @@ -226,6 +231,62 @@ class TestGenerateProposalLabels(unittest.TestCase): assert bbox_outside_weights.shape[1] == 4 * class_nums +class TestGenerateMaskLabels(unittest.TestCase): + def test_generate_mask_labels(self): + program = Program() + with program_guard(program): + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) + gt_classes = layers.data( + name='gt_classes', + shape=[2, 1], + dtype='int32', + lod_level=1, + append_batch_size=False) + is_crowd = layers.data( + name='is_crowd', + shape=[2, 1], + dtype='int32', + lod_level=1, + append_batch_size=False) + gt_segms = layers.data( + name='gt_segms', + shape=[20, 2], + dtype='float32', + lod_level=3, + append_batch_size=False) + rois = layers.data( + name='rois', + shape=[4, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + labels_int32 = layers.data( + name='labels_int32', + shape=[4, 1], + dtype='int32', + lod_level=1, + 
append_batch_size=False) + num_classes = 5 + resolution = 14 + outs = fluid.layers.generate_mask_labels( + im_info=im_info, + gt_classes=gt_classes, + is_crowd=is_crowd, + gt_segms=gt_segms, + rois=rois, + labels_int32=labels_int32, + num_classes=num_classes, + resolution=resolution) + mask_rois, roi_has_mask_int32, mask_int32 = outs + assert mask_rois.shape[1] == 4 + assert mask_int32.shape[1] == num_classes * resolution * resolution + + class TestMultiBoxHead(unittest.TestCase): def test_multi_box_head(self): data_shape = [3, 224, 224] @@ -313,7 +374,7 @@ class TestRpnTargetAssign(unittest.TestCase): name='gt_boxes', shape=[4], lod_level=1, dtype='float32') is_crowd = layers.data( name='is_crowd', - shape=[10], + shape=[1, 10], dtype='int32', lod_level=1, append_batch_size=False) @@ -323,7 +384,7 @@ class TestRpnTargetAssign(unittest.TestCase): dtype='float32', lod_level=1, append_batch_size=False) - pred_scores, pred_loc, tgt_lbl, tgt_bbox, bbox_inside_weight = layers.rpn_target_assign( + outs = layers.rpn_target_assign( bbox_pred=bbox_pred, cls_logits=cls_logits, anchor_box=anchor_box, @@ -337,6 +398,11 @@ class TestRpnTargetAssign(unittest.TestCase): rpn_positive_overlap=0.7, rpn_negative_overlap=0.3, use_random=False) + pred_scores = outs[0] + pred_loc = outs[1] + tgt_lbl = outs[2] + tgt_bbox = outs[3] + bbox_inside_weight = outs[4] self.assertIsNotNone(pred_scores) self.assertIsNotNone(pred_loc) @@ -351,41 +417,43 @@ class TestRpnTargetAssign(unittest.TestCase): class TestGenerateProposals(unittest.TestCase): def test_generate_proposals(self): - data_shape = [20, 64, 64] - images = fluid.layers.data( - name='images', shape=data_shape, dtype='float32') - im_info = fluid.layers.data( - name='im_info', shape=[1, 3], dtype='float32') - anchors, variances = fluid.layers.anchor_generator( - name='anchor_generator', - input=images, - anchor_sizes=[32, 64], - aspect_ratios=[1.0], - variance=[0.1, 0.1, 0.2, 0.2], - stride=[16.0, 16.0], - offset=0.5) - num_anchors = anchors.shape[2] - scores = fluid.layers.data( - name='scores', shape=[1, num_anchors, 8, 8], dtype='float32') - bbox_deltas = fluid.layers.data( - name='bbox_deltas', - shape=[1, num_anchors * 4, 8, 8], - dtype='float32') - rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( - name='generate_proposals', - scores=scores, - bbox_deltas=bbox_deltas, - im_info=im_info, - anchors=anchors, - variances=variances, - pre_nms_top_n=6000, - post_nms_top_n=1000, - nms_thresh=0.5, - min_size=0.1, - eta=1.0) - self.assertIsNotNone(rpn_rois) - self.assertIsNotNone(rpn_roi_probs) - print(rpn_rois.shape) + program = Program() + with program_guard(program): + data_shape = [20, 64, 64] + images = fluid.layers.data( + name='images', shape=data_shape, dtype='float32') + im_info = fluid.layers.data( + name='im_info', shape=[3], dtype='float32') + anchors, variances = fluid.layers.anchor_generator( + name='anchor_generator', + input=images, + anchor_sizes=[32, 64], + aspect_ratios=[1.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + num_anchors = anchors.shape[2] + scores = fluid.layers.data( + name='scores', shape=[num_anchors, 8, 8], dtype='float32') + bbox_deltas = fluid.layers.data( + name='bbox_deltas', + shape=[num_anchors * 4, 8, 8], + dtype='float32') + rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( + name='generate_proposals', + scores=scores, + bbox_deltas=bbox_deltas, + im_info=im_info, + anchors=anchors, + variances=variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + 
min_size=0.1, + eta=1.0) + self.assertIsNotNone(rpn_rois) + self.assertIsNotNone(rpn_roi_probs) + print(rpn_rois.shape) class TestYoloDetection(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 808e1e6aa8..c23dfa01e7 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -84,6 +84,7 @@ list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) +list(REMOVE_ITEM TEST_OPS test_imperative_resnet) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -91,6 +92,8 @@ py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL) +py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS + FLAGS_cudnn_deterministic=1) if(WITH_DISTRIBUTE) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py index faec535042..f0f13a9d49 100644 --- a/python/paddle/fluid/tests/unittests/dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/dist_save_load.py @@ -80,7 +80,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): # NOTE: pserver should not call memory optimize t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, - args.trainers, args.sync_mode) + args.trainers, args.sync_mode, False, + args.current_endpoint) pserver_prog = t.get_pserver_program(args.current_endpoint) startup_prog = t.get_startup_program(args.current_endpoint, pserver_prog) @@ -93,7 +94,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): exe.run(startup_prog) if need_load and model_dir: - self._load_persistable_vars(exe, model_dir, startup_prog) + fluid.io.load_persistables(exe, model_dir, pserver_prog) + exe.run(pserver_prog) def run_trainer(self, args): @@ -158,19 +160,46 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): need_save = bool(int(os.getenv("SAVE", "0"))) model_dir = os.getenv("MODEL_DIR", "") - - if need_save: - for _ in six.moves.xrange(RUN_STEP): - loss, = exe.run(fetch_list=[avg_cost.name], - feed=feeder.feed(get_data())) - if need_save and model_dir: - io.save_persistables(startup_exe, model_dir, trainer_prog) - - var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor()) - if six.PY2: - print(pickle.dumps(np.ravel(var).tolist())) + save_mode = os.getenv("SAVE_MODE", "") + + if save_mode == "LOCAL": + if need_save: + for _ in six.moves.xrange(RUN_STEP): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + if need_save and model_dir: + io.save_persistables(startup_exe, model_dir, trainer_prog) + + var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor( + )) + if six.PY2: + print(pickle.dumps(np.ravel(var).tolist())) + else: + sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) + + elif save_mode == "DIST": + skip_steps = int(os.getenv("SKIP_STEPS")) + loss = None + 
if need_save: + for idx in six.moves.xrange(8): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + if need_save and model_dir and idx == skip_steps and args.trainer_id == 0: + io.save_persistables(startup_exe, model_dir, + trainer_prog) + else: + for idx in six.moves.xrange(8): + data = get_data() + if idx <= skip_steps: + continue + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(data)) + if six.PY2: + print(pickle.dumps(loss.tolist())) + else: + sys.stdout.buffer.write(pickle.dumps(loss.tolist())) else: - sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist())) + raise Exception("save_mode must be LOCAL or DIST") if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py index fac5e037a4..09afae6114 100644 --- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -75,9 +75,13 @@ def get_loss(cos_q_pt, cos_q_nt): return avg_cost -def get_optimizer(): - # SGD optimizer - optimizer = fluid.optimizer.SGD(learning_rate=base_lr) +def get_optimizer(op="sgd"): + if op.upper() == "sgd".upper(): + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) + elif op.upper() == "adam".upper(): + optimizer = fluid.optimizer.Adam(learning_rate=base_lr) + else: + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) return optimizer @@ -237,7 +241,8 @@ class TestDistSimnetBow2x2(TestDistRunnerBase): inference_program = fluid.default_main_program().clone() # Optimization - opt = get_optimizer() + opt = os.getenv('OPTIMIZER', 'sgd') + opt = get_optimizer(opt) opt.minimize(avg_cost) # Reader diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 3fcdc57906..0968ace62b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -43,7 +43,8 @@ class TestDistRunnerBase(object): pserver_endpoints, trainers, sync_mode, - dc_asgd=False): + dc_asgd=False, + current_endpoint=None): # NOTE: import fluid until runtime, or else forking processes will cause error. 
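The string dispatch in get_optimizer above can be written more directly with a mapping; a sketch that preserves the SGD fall-back and assumes the module's existing fluid import and base_lr:

    def get_optimizer(op="sgd"):
        factories = {
            "SGD": lambda: fluid.optimizer.SGD(learning_rate=base_lr),
            "ADAM": lambda: fluid.optimizer.Adam(learning_rate=base_lr),
        }
        # unknown names fall back to SGD, matching the original behavior
        return factories.get(op.upper(), factories["SGD"])()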
config = fluid.DistributeTranspilerConfig() config.enable_dc_asgd = dc_asgd @@ -53,7 +54,8 @@ class TestDistRunnerBase(object): program=main_program, pservers=pserver_endpoints, trainers=trainers, - sync_mode=sync_mode) + sync_mode=sync_mode, + current_endpoint=current_endpoint) return t def run_pserver(self, args): @@ -122,8 +124,8 @@ class TestDistRunnerBase(object): if args.batch_merge_repeat > 1: pass_builder = build_stra._finalize_strategy_and_create_passes() mypass = pass_builder.insert_pass( - len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") - mypass.set_int("num_repeats", args.batch_merge_repeat) + len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass") + mypass.set("num_repeats", args.batch_merge_repeat) if args.update_method == "nccl2": build_stra.num_trainers = len(args.endpoints.split(",")) diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py index 4588ca7c17..e795bc410e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py @@ -33,7 +33,6 @@ class TestDistSaveLoadDense2x2(TestDistBase): delta=1e-3, check_error_log=False, need_envs={}): - required_envs = { "PATH": os.getenv("PATH", ""), "PYTHONPATH": os.getenv("PYTHONPATH", ""), @@ -77,7 +76,77 @@ class TestDistSaveLoadDense2x2(TestDistBase): need_envs = { "IS_DISTRIBUTED": '0', "IS_SPARSE": '0', - 'IS_SELF_CONTAINED_LR': '1' + 'IS_SELF_CONTAINED_LR': '1', + 'SAVE_MODE': 'LOCAL', + } + self.check_with_place( + "dist_save_load.py", + delta=0, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._enforce_place = "CPU" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "http_proxy": "" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + model_dir = tempfile.mkdtemp() + + save_env = {} + save_env["SAVE_MODE"] = "DIST" + save_env["SAVE"] = "1" + save_env["MODEL_DIR"] = model_dir + save_env.update(required_envs) + + tr0_var_1, tr1_var_1 = self._run_cluster(model_file, save_env, + check_error_log) + + load_env = {} + load_env["LOAD"] = "1" + load_env["MODEL_DIR"] = model_dir + load_env.update(required_envs) + tr0_var_2, tr1_var_2 = self._run_cluster(model_file, load_env, + check_error_log) + + shutil.rmtree(model_dir) + + train0_1_np = np.array(tr0_var_1) + train1_1_np = np.array(tr1_var_1) + train0_2_np = np.array(tr0_var_2) + train1_2_np = np.array(tr1_var_2) + + self.assertAlmostEqual( + train0_1_np.all(), train0_2_np.all(), delta=delta) + self.assertAlmostEqual( + train1_1_np.all(), train1_2_np.all(), delta=delta) + + def test_dist(self): + need_envs = { + "IS_DISTRIBUTED": '0', + "IS_SPARSE": '0', + 'IS_SELF_CONTAINED_LR': '1', + 'SAVE_MODE': 'DIST', + 'OPTIMIZER': 'ADAM', + 'SKIP_STEPS': str(np.random.randint(2, 6)) } self.check_with_place( "dist_save_load.py", diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 3d1ce6b27c..3566fed215 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 3d1ce6b27c..3566fed215 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -741,21 +741,40 @@ class TestLoadSliceVar(TranspilerTest):
         pserver, _ = self.get_pserver(self.pserver1_ep)
         pserver2, _ = self.get_pserver(self.pserver2_ep)
 
-        self.assertTrue(pserver._slice_vars_and_attrs)
-        self.assertTrue(pserver2._slice_vars_and_attrs)
-
-        for idx in six.moves.xrange(len(pserver._slice_vars_and_attrs)):
-            self.assertEqual(pserver._slice_vars_and_attrs[idx][0],
-                             pserver2._slice_vars_and_attrs[idx][0])
-
-            total_numel = six.moves.reduce(
-                lambda x, y: x * y, pserver._slice_vars_and_attrs[idx][0].shape)
-            self.assertEqual(
-                total_numel,
-                six.moves.reduce(lambda x, y: x * y,
-                                 pserver._slice_vars_and_attrs[idx][2].shape) +
-                six.moves.reduce(lambda x, y: x * y,
-                                 pserver2._slice_vars_and_attrs[idx][2].shape))
+        vars_ps1 = pserver._parameters_on_pservers.get_distributed_vars_by_ep(
+            self.pserver1_ep)
+        vars_ps2 = pserver._parameters_on_pservers.get_distributed_vars_by_ep(
+            self.pserver2_ep)
+
+        self.assertTrue(vars_ps1)
+        self.assertTrue(vars_ps2)
+
+        for idx in six.moves.xrange(len(vars_ps1)):
+            total_numel = 0
+            ps1_numel, ps2_numel = 0, 0
+
+            ps1_var = vars_ps1[idx]
+
+            if not ps1_var.is_slice:
+                total_numel = six.moves.reduce(lambda x, y: x * y,
+                                               vars_ps1[idx].origin.shape)
+                ps1_numel = six.moves.reduce(lambda x, y: x * y,
+                                             vars_ps1[idx].slice.shape)
+            else:
+                ps2_var = None
+                for var in vars_ps2:
+                    if var.origin.name == ps1_var.origin.name:
+                        ps2_var = var
+                        break
+
+                total_numel = six.moves.reduce(lambda x, y: x * y,
+                                               ps1_var.origin.shape)
+                ps1_numel = six.moves.reduce(lambda x, y: x * y,
+                                             ps1_var.slice.shape)
+                ps2_numel = six.moves.reduce(lambda x, y: x * y,
+                                             ps2_var.slice.shape)
+
+            self.assertEqual(total_numel, ps1_numel + ps2_numel)
 
 
 class TestNCCL2Transpile(TranspilerTest):
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
index 7ec1f0ae75..56dfb095de 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
@@ -16,12 +16,17 @@ import os
 import unittest
 os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
 
+# FIXME(zjl): this unittest seems to fail randomly when the last loss under
+# the all_reduce strategy is compared with the last loss under the reduce
+# strategy, e.g.: AssertionError: 1.0357145 != 1.0673475 within 0.01 delta.
+# Disable it temporarily.
+'''
 from test_parallel_executor_mnist import TestMNIST
 
 
 class EagerDeletionTestMNIST(TestMNIST):
     pass
-
+'''
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py
new file mode 100644
index 0000000000..1d7ce33ea7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py
@@ -0,0 +1,421 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
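+
+# NOTE: everything below builds a pure-NumPy reference implementation of
+# generate_mask_labels (including poly2mask rasterization of COCO-style
+# polygons) so that the op's output can be checked against it.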
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +import six +import paddle.fluid as fluid +from op_test import OpTest +''' +# Equivalent code +rles = mask_util.frPyObjects([segm], im_h, im_w) +mask = mask_util.decode(rles) +''' + + +def decode(cnts, m): + v = 0 + mask = [] + for j in range(m): + for k in range(cnts[j]): + mask.append(v) + v = 1 - v + return mask + + +def poly2mask(xy, k, h, w): + scale = 5. + x = [int(scale * p + 0.5) for p in xy[::2]] + x = x + [x[0]] + y = [int(scale * p + 0.5) for p in xy[1::2]] + y = y + [y[0]] + m = sum([ + int(max(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1]))) + int(1) + for j in range(k) + ]) + + u, v = [], [] + for j in range(k): + xs = x[j] + xe = x[j + 1] + ys = y[j] + ye = y[j + 1] + dx = abs(xe - xs) + dy = abs(ys - ye) + flip = (dx >= dy and xs > xe) or (dx < dy and ys > ye) + if flip: + xs, xe = xe, xs + ys, ye = ye, ys + + if dx >= dy: + if (dx == 0): assert ye - ys == 0 + s = 0 if dx == 0 else float(ye - ys) / dx + else: + if (dy == 0): assert xe - xs == 0 + s = 0 if dy == 0 else float(xe - xs) / dy + + if dx >= dy: + ts = [dx - d if flip else d for d in range(dx + 1)] + u.extend([xs + t for t in ts]) + v.extend([int(ys + s * t + .5) for t in ts]) + else: + ts = [dy - d if flip else d for d in range(dy + 1)] + v.extend([t + ys for t in ts]) + u.extend([int(xs + s * t + .5) for t in ts]) + + k = len(u) + x = np.zeros((k), np.int) + y = np.zeros((k), np.int) + m = 0 + for j in six.moves.xrange(1, k): + if u[j] != u[j - 1]: + xd = float(u[j] if (u[j] < u[j - 1]) else (u[j] - 1)) + xd = (xd + .5) / scale - .5 + if (math.floor(xd) != xd or xd < 0 or xd > (w - 1)): + continue + yd = float(v[j] if v[j] < v[j - 1] else v[j - 1]) + yd = (yd + .5) / scale - .5 + yd = math.ceil(0 if yd < 0 else (h if yd > h else yd)) + x[m] = int(xd) + y[m] = int(yd) + m += 1 + k = m + a = [int(x[i] * h + y[i]) for i in range(k)] + a.append(h * w) + a.sort() + b = [0] + a[:len(a) - 1] + a = [c - d for (c, d) in zip(a, b)] + + k += 1 + b = [0 for i in range(k)] + b[0] = a[0] + m, j = 1, 1 + while (j < k): + if a[j] > 0: + b[m] = a[j] + m += 1 + j += 1 + else: + j += 1 + if (j < k): + b[m - 1] += a[j] + j += 1 + mask = decode(b, m) + mask = np.array(mask, dtype=np.int).reshape((w, h)) + mask = mask.transpose((1, 0)) + return mask + + +def polys_to_boxes(polys): + """Convert a list of polygons into an array of tight bounding boxes.""" + boxes_from_polys = np.zeros((len(polys), 4), dtype=np.float32) + for i in range(len(polys)): + poly = polys[i] + x0 = min(min(p[::2]) for p in poly) + x1 = max(max(p[::2]) for p in poly) + y0 = min(min(p[1::2]) for p in poly) + y1 = max(max(p[1::2]) for p in poly) + boxes_from_polys[i, :] = [x0, y0, x1, y1] + return boxes_from_polys + + +def bbox_overlaps(boxes, query_boxes): + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) *\ + (query_boxes[k, 3] - query_boxes[k, 1] + 1) + for n in range(N): + iw = min(boxes[n, 2], query_boxes[k, 2]) -\ + max(boxes[n, 0], query_boxes[k, 0]) + 1 + if iw > 0: + ih = min(boxes[n, 3], query_boxes[k, 3]) -\ + max(boxes[n, 1], query_boxes[k, 1]) + 1 + if ih > 0: + ua = float( + (boxes[n, 2] - boxes[n, 0] + 1) *\ + (boxes[n, 3] - boxes[n, 1] + 1) +\ + box_area - iw * ih) + overlaps[n, k] = iw * ih / ua + return overlaps + + +def polys_to_mask_wrt_box(polygons, box, M): + """Convert from the COCO polygon segmentation format to a 
binary mask + encoded as a 2D array of data type numpy.float32. The polygon segmentation + is understood to be enclosed in the given box and rasterized to an M x M + mask. The resulting mask is therefore of shape (M, M). + """ + w = box[2] - box[0] + h = box[3] - box[1] + + w = np.maximum(w, 1) + h = np.maximum(h, 1) + + polygons_norm = [] + for poly in polygons: + p = np.array(poly, dtype=np.float32) + p[0::2] = (p[0::2] - box[0]) * M / w + p[1::2] = (p[1::2] - box[1]) * M / h + polygons_norm.append(p) + + mask = [] + for polygons in polygons_norm: + assert polygons.shape[0] % 2 == 0 + k = polygons.shape[0] // 2 + mask.append(poly2mask(polygons, k, M, M)) + mask = np.array(mask) + # Flatten in case polygons was a list + mask = np.sum(mask, axis=0) + mask = np.array(mask > 0, dtype=np.float32) + return mask + + +def expand_mask_targets(masks, mask_class_labels, resolution, num_classes): + """Expand masks from shape (#masks, resolution ** 2) + to (#masks, #classes * resolution ** 2) to encode class + specific mask targets. + """ + assert masks.shape[0] == mask_class_labels.shape[0] + + # Target values of -1 are "don't care" / ignore labels + mask_targets = -np.ones( + (masks.shape[0], num_classes * resolution**2), dtype=np.int32) + for i in range(masks.shape[0]): + cls = int(mask_class_labels[i]) + start = resolution**2 * cls + end = start + resolution**2 + # Ignore background instance + # (only happens when there is no fg samples in an image) + if cls > 0: + mask_targets[i, start:end] = masks[i, :] + return mask_targets + + +def generate_mask_labels(num_classes, im_info, gt_classes, is_crowd, + label_int32, gt_polys, resolution, rois, roi_lod, + gt_lod): + mask_rois = [] + roi_has_mask_int32 = [] + mask_int32 = [] + new_lod = [] + for i in range(len(im_info)): + roi_s = roi_lod[i] + roi_e = roi_lod[i + 1] + gt_s = gt_lod[i] + gt_e = gt_lod[i + 1] + mask_blob = _sample_mask(num_classes, im_info[i], gt_classes[gt_s:gt_e], + is_crowd[gt_s:gt_e], label_int32[roi_s:roi_e], + gt_polys[i], resolution, rois[roi_s:roi_e]) + new_lod.append(mask_blob['mask_rois'].shape[0]) + mask_rois.append(mask_blob['mask_rois']) + roi_has_mask_int32.append(mask_blob['roi_has_mask_int32']) + mask_int32.append(mask_blob['mask_int32']) + return mask_rois, roi_has_mask_int32, mask_int32, new_lod + + +def _sample_mask( + num_classes, + im_info, + gt_classes, + is_crowd, + label_int32, + gt_polys, # [[[], []], []] + resolution, + rois): + mask_blob = {} + im_scale = im_info[2] + sample_boxes = rois + polys_gt_inds = np.where((gt_classes > 0) & (is_crowd == 0))[0] + polys_gt = [gt_polys[i] for i in polys_gt_inds] + boxes_from_polys = polys_to_boxes(polys_gt) + + fg_inds = np.where(label_int32 > 0)[0] + roi_has_mask = fg_inds.copy() + if fg_inds.shape[0] > 0: + mask_class_labels = label_int32[fg_inds] + masks = np.zeros((fg_inds.shape[0], resolution**2), dtype=np.int32) + rois_fg = sample_boxes[fg_inds] + overlaps_bbfg_bbpolys = bbox_overlaps( + rois_fg.astype(np.float32), boxes_from_polys.astype(np.float32)) + fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1) + for i in range(rois_fg.shape[0]): + fg_polys_ind = fg_polys_inds[i] + poly_gt = polys_gt[fg_polys_ind] + roi_fg = rois_fg[i] + mask = polys_to_mask_wrt_box(poly_gt, roi_fg, resolution) + mask = np.array(mask > 0, dtype=np.int32) + masks[i, :] = np.reshape(mask, resolution**2) + else: + bg_inds = np.where(label_int32 == 0)[0] + rois_fg = sample_boxes[bg_inds[0]].reshape((1, -1)) + masks = -np.ones((1, resolution**2), dtype=np.int32) + mask_class_labels = 
np.zeros((1, )) + roi_has_mask = np.append(roi_has_mask, 0) + masks = expand_mask_targets(masks, mask_class_labels, resolution, + num_classes) + rois_fg *= im_scale + mask_blob['mask_rois'] = rois_fg + mask_blob['roi_has_mask_int32'] = roi_has_mask + mask_blob['mask_int32'] = masks + return mask_blob + + +def trans_lod(lod): + new_lod = [0] + for i in range(len(lod)): + new_lod.append(lod[i] + new_lod[i]) + return new_lod + + +class TestGenerateMaskLabels(OpTest): + def set_data(self): + self.init_test_case() + self.make_generate_proposal_labels_out() + self.generate_gt_polys() + self.generate_groundtruth() + self.init_test_output() + self.inputs = { + 'ImInfo': self.im_info, + 'GtClasses': (self.gt_classes.astype(np.int32), self.gt_lod), + 'IsCrowd': (self.is_crowd.astype(np.int32), self.gt_lod), + 'LabelsInt32': (self.label_int32.astype(np.int32), self.rois_lod), + 'GtSegms': (self.gt_polys.astype(np.float32), self.masks_lod), + 'Rois': (self.rois.astype(np.float32), self.rois_lod) + } + self.attrs = { + 'num_classes': self.num_classes, + 'resolution': self.resolution + } + self.outputs = { + 'MaskRois': (self.mask_rois, [self.new_lod]), + 'RoiHasMaskInt32': (self.roi_has_mask_int32, [self.new_lod]), + 'MaskInt32': (self.mask_int32, [self.new_lod]) + } + + def init_test_case(self): + self.num_classes = 81 + self.resolution = 14 + self.batch_size = 2 + self.batch_size_per_im = 64 + self.images_shape = [100, 200] + np.random.seed(0) + + def make_generate_proposal_labels_out(self): + rois = [] + self.rois_lod = [[]] + self.label_int32 = [] + for bno in range(self.batch_size): + self.rois_lod[0].append(self.batch_size_per_im) + for i in range(self.batch_size_per_im): + xywh = np.random.rand(4) + xy1 = xywh[0:2] * 2 + wh = xywh[2:4] * (self.images_shape[0] - xy1) + xy2 = xy1 + wh + roi = [xy1[0], xy1[1], xy2[0], xy2[1]] + rois.append(roi) + self.rois = np.array(rois).astype("float32") + for idx, roi_num in enumerate(self.rois_lod[0]): + for roi_id in range(roi_num): + class_id = np.random.random_integers(self.num_classes - 1) + if idx == 0: + # set an image with no foreground, to test the empty case + self.label_int32.append(0) + else: + self.label_int32.append(class_id) + label_np = np.array(self.label_int32) + self.label_int32 = label_np[:, np.newaxis] + + def generate_gt_polys(self): + h, w = self.images_shape[0:2] + self.gt_polys = [] + self.gt_polys_list = [] + max_gt = 4 + max_poly_num = 5 + min_poly_size = 4 + max_poly_size = 16 + lod0 = [] + lod1 = [] + lod2 = [] + for i in range(self.batch_size): + gt_num = np.random.randint(1, high=max_gt, size=1)[0] + lod0.append(gt_num) + ptss = [] + for i in range(gt_num): + poly_num = np.random.randint(1, max_poly_num, size=1)[0] + lod1.append(poly_num) + pts = [] + for j in range(poly_num): + poly_size = np.random.randint( + min_poly_size, max_poly_size, size=1)[0] + x = np.random.rand(poly_size, 1) * w + y = np.random.rand(poly_size, 1) * h + xy = np.concatenate((x, y), axis=1) + pts.append(xy.flatten().tolist()) + self.gt_polys.extend(xy.flatten().tolist()) + lod2.append(poly_size) + ptss.append(pts) + self.gt_polys_list.append(ptss) + self.masks_lod = [lod0, lod1, lod2] + self.gt_lod = [lod0] + self.gt_polys = np.array(self.gt_polys).astype('float32').reshape(-1, 2) + + def generate_groundtruth(self): + self.im_info = [] + self.gt_classes = [] + self.is_crowd = [] + for roi_num in self.gt_lod[0]: + self.im_info.append(self.images_shape + [1.0]) + for roi_id in range(roi_num): + class_id = np.random.random_integers(self.num_classes - 1) + 
self.gt_classes.append(class_id) + self.is_crowd.append(0) + self.im_info = np.array(self.im_info).astype(np.float32) + gt_classes_np = np.array(self.gt_classes) + self.gt_classes = gt_classes_np[:, np.newaxis] + is_crowd_np = np.array(self.is_crowd) + self.is_crowd = is_crowd_np[:, np.newaxis] + + def init_test_output(self): + roi_lod = trans_lod(self.rois_lod[0]) + gt_lod = trans_lod(self.gt_lod[0]) + outs = generate_mask_labels(self.num_classes, self.im_info, + self.gt_classes, self.is_crowd, + self.label_int32, self.gt_polys_list, + self.resolution, self.rois, roi_lod, gt_lod) + self.mask_rois = outs[0] + self.roi_has_mask_int32 = outs[1] + self.mask_int32 = outs[2] + self.new_lod = outs[3] + + self.mask_rois = np.vstack(self.mask_rois) + self.roi_has_mask_int32 = np.hstack(self.roi_has_mask_int32)[:, + np.newaxis] + self.mask_int32 = np.vstack(self.mask_int32) + + def setUp(self): + self.op_type = "generate_mask_labels" + self.set_data() + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py index 2d5cd3b24b..5f6328707f 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://w_idxw.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 9340d55857..5ce405dcca 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://w_idxw.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function + import unittest import numpy as np import sys diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index dfe4daca95..adf35c851b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -67,6 +67,18 @@ class MLP(fluid.imperative.Layer): class TestImperative(unittest.TestCase): + def test_sum_op(self): + x = np.ones([2, 2], np.float32) + with fluid.imperative.guard(): + inputs = [] + for _ in range(10): + inputs.append(fluid.imperative.base.to_variable(x)) + ret = fluid.layers.sums(inputs) + loss = fluid.layers.reduce_sum(ret) + loss._backward() + self.assertTrue(np.allclose(ret._numpy(), x * 10)) + self.assertTrue(np.allclose(inputs[0]._gradient(), x)) + def test_layer(self): with fluid.imperative.guard(): cl = core.Layer() @@ -133,7 +145,8 @@ class TestImperative(unittest.TestCase): x = fluid.layers.reduce_sum(fluid.layers.tanh(x1)) param_grads = fluid.backward.append_backward( x, parameter_list=[x1.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) static_out, static_grad = exe.run( feed={inp.name: np_inp}, @@ -160,7 +173,8 @@ class TestImperative(unittest.TestCase): x = l(inp)[0] param_grads = fluid.backward.append_backward( x, parameter_list=[l._x_for_debug.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) static_out, static_grad = exe.run( feed={inp.name: np_inp}, @@ -186,7 +200,8 @@ class TestImperative(unittest.TestCase): out = mlp(inp) param_grads = fluid.backward.append_backward( out, parameter_list=[mlp._fc1._w.name])[0] - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) exe.run(fluid.default_startup_program()) static_out, static_grad = exe.run( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index 4fe286f85e..681661bfc6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -20,6 +20,7 @@ import sys import paddle import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from test_imperative_base import new_program_scope @@ -58,7 +59,7 @@ class Generator(fluid.imperative.Layer): class TestImperativeMnist(unittest.TestCase): - def test_mnist_cpu_float32(self): + def test_gan_float32(self): seed = 90 startup = fluid.Program() @@ -115,7 +116,8 @@ class TestImperativeMnist(unittest.TestCase): sgd = SGDOptimizer(learning_rate=1e-3) sgd.minimize(g_loss) - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0)) static_params = dict() with fluid.scope_guard(scope): img = np.ones([2, 1], np.float32) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 63eeae4b71..d0a5a88317 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -145,7 +145,8 @@ class 
TestImperativeMnist(unittest.TestCase): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - exe = fluid.Executor(fluid.CPUPlace()) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) mnist = MNIST() sgd = SGDOptimizer(learning_rate=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py new file mode 100644 index 0000000000..87a72dd04e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -0,0 +1,370 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC +from paddle.fluid.imperative.base import to_variable +from test_imperative_base import new_program_scope + +batch_size = 8 +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": batch_size, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + }, + "batch_size": batch_size, + "lr": 0.1, + "total_images": 1281164, +} + + +def optimizer_setting(params): + ls = params["learning_strategy"] + if ls["name"] == "piecewise_decay": + if "total_images" not in params: + total_images = 1281167 + else: + total_images = params["total_images"] + batch_size = ls["batch_size"] + step = int(total_images / batch_size + 1) + + bd = [step * e for e in ls["epochs"]] + base_lr = params["lr"] + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + optimizer = fluid.optimizer.SGD(learning_rate=0.01) + # TODO(minqiyang): Add learning rate scheduler support to imperative mode + # optimizer = fluid.optimizer.Momentum( + # learning_rate=params["lr"], + # learning_rate=fluid.layers.piecewise_decay( + # boundaries=bd, values=lr), + # momentum=0.9, + # regularization=fluid.regularizer.L2Decay(1e-4)) + + return optimizer + + +class ConvBNLayer(fluid.imperative.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=None) + + self._batch_norm = BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + + return y + + +class BottleneckBlock(fluid.imperative.Layer): + def __init__(self, num_channels, num_filters, stride, shortcut=True): + super(BottleneckBlock, self).__init__() + 
+ self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = fluid.layers.elementwise_add(x=short, y=conv2) + + layer_helper = LayerHelper('elementwise_add_activation', act='relu') + return layer_helper.append_activation(y) + + +class ResNet(fluid.imperative.Layer): + def __init__(self, layers=50, class_dim=102): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_filters = [64, 128, 256, 512] + + self.conv = ConvBNLayer( + num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') + self.pool2d_max = Pool2D( + pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') + + self.bottleneck_block_list = [] + num_channels = 64 + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + bottleneck_block = BottleneckBlock( + num_channels=num_channels, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut) + num_channels = bottleneck_block._num_channels_out + self.bottleneck_block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = Pool2D( + pool_size=7, pool_type='avg', global_pooling=True) + + import math + stdv = 1.0 / math.sqrt(2048 * 1.0) + + self.out = FC(size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + def forward(self, inputs): + y = self.conv(inputs) + y = self.pool2d_max(y) + for bottleneck_block in self.bottleneck_block_list: + y = bottleneck_block(y) + y = self.pool2d_avg(y) + y = self.out(y) + return y + + +class TestImperativeResnet(unittest.TestCase): + def test_resnet_float32(self): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 1 + with fluid.imperative.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + resnet = ResNet() + optimizer = optimizer_setting(train_parameters) + np.random.seed(seed) + import random + random.seed = seed + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) + + dy_param_init_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_init_value[param.name] = param._numpy() + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + + dy_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + batch_size, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + 
label._stop_gradient = True + + out = resnet(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + dy_out = avg_loss._numpy() + + if batch_id == 0: + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if param.name not in dy_param_init_value: + dy_param_init_value[param.name] = param._numpy() + + avg_loss._backward() + + dy_grad_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if not param.stop_gradient: + np_array = np.array(param._ivar._grad_ivar().value() + .get_tensor()) + dy_grad_value[param.name + core.grad_var_suffix( + )] = np_array + + optimizer.minimize(avg_loss) + + dy_param_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_value[param.name] = param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + + resnet = ResNet() + optimizer = optimizer_setting(train_parameters) + + np.random.seed(seed) + import random + random.seed = seed + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) + + img = fluid.layers.data( + name='pixel', shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + out = resnet(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + optimizer.minimize(avg_loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + static_grad_name_list = [] + for param in fluid.default_startup_program().global_block( + ).all_parameters(): + static_param_name_list.append(param.name) + for param in fluid.default_main_program().global_block( + ).all_parameters(): + if not param.stop_gradient: + static_grad_name_list.append(param.name + + core.grad_var_suffix()) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_init_value[static_param_name_list[i]] = out[i] + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + + static_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [batch_size, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + fetch_list.extend(static_grad_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_grad_value = {} + static_out = out[0] + param_start_pos = 1 + grad_start_pos = len(static_param_name_list) + param_start_pos + for i in range(param_start_pos, + len(static_param_name_list) + param_start_pos): + static_param_value[static_param_name_list[ + i - param_start_pos]] = out[i] + for i in range(grad_start_pos, + len(static_grad_name_list) + grad_start_pos): + static_grad_value[static_grad_name_list[ + i - grad_start_pos]] = out[i] + + self.assertTrue(np.allclose(static_out, dy_out)) + + self.assertEqual(len(dy_param_init_value), len(static_param_init_value)) + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.allclose(value, dy_param_init_value[key])) + 
            self.assertTrue(np.isfinite(value).all())
+            self.assertFalse(np.isnan(value).any())
+
+        self.assertEqual(len(dy_grad_value), len(static_grad_value))
+        for key, value in six.iteritems(static_grad_value):
+            self.assertTrue(np.allclose(value, dy_grad_value[key]))
+            self.assertTrue(np.isfinite(value).all())
+            self.assertFalse(np.isnan(value).any())
+
+        self.assertEqual(len(dy_param_value), len(static_param_value))
+        for key, value in six.iteritems(static_param_value):
+            self.assertTrue(np.allclose(value, dy_param_value[key]))
+            self.assertTrue(np.isfinite(value).all())
+            self.assertFalse(np.isnan(value).any())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index ab7183f88d..2d98b063d1 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -420,5 +420,26 @@ class TestMSRAInitializer(unittest.TestCase):
         self.assertEqual(init_op.type, 'assign_value')
 
 
+class TestNumpyArrayInitializer(unittest.TestCase):
+    def test_numpy_array_initializer(self):
+        """Test the numpy array initializer with supplied arguments
+        """
+        import numpy
+        program = framework.Program()
+        block = program.global_block()
+        np_array = numpy.random.random((10000)).astype("float32")
+        for _ in range(2):
+            block.create_parameter(
+                dtype=np_array.dtype,
+                shape=np_array.shape,
+                lod_level=0,
+                name="param",
+                initializer=initializer.NumpyArrayInitializer(np_array))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'assign_value')
+        assert (init_op.attr('fp32_values') == np_array).all()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 90f5d797a6..c13f03e86f 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -1023,6 +1023,14 @@ class TestBook(unittest.TestCase):
 
             print(str(program))
 
+    def test_shuffle_channel(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
+            out = layers.shuffle_channel(x, group=4)
+            self.assertIsNotNone(out)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py
index 8c9e489e02..7e1c2572f0 100644
--- a/python/paddle/fluid/tests/unittests/test_pass_builder.py
+++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py
@@ -111,7 +111,7 @@ class TestPassBuilder(unittest.TestCase):
         pass_builder.remove_pass(len(pass_builder.all_passes()) - 1)
         self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
 
-        viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass")
+        viz_pass.set("graph_viz_path", "/tmp/test_viz_pass")
 
         self.check_network_convergence(
             use_cuda=core.is_compiled_with_cuda(),
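The two pass-related hunks above track an API consolidation on Pass attributes: the typed setters become one generic set(). A before/after sketch, assuming viz_pass and mypass obtained from a pass_builder as in these tests:

    # before: one setter per attribute type
    viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass")
    mypass.set_int("num_repeats", args.batch_merge_repeat)

    # after: a single generic setter, as used in this patch
    viz_pass.set("graph_viz_path", "/tmp/test_viz_pass")
    mypass.set("num_repeats", args.batch_merge_repeat)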
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
index ffd4026dba..d33a57f675 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
@@ -81,11 +81,10 @@ class TestSequenceExpand(OpTest):
 class TestSequenceExpandCase1(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
-        x_lod = [[2, 3]]
         y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
         y_lod = [[2, 3], [2, 2, 3, 3, 3]]
         self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
-        self.attrs = {'ref_level': 0}
+        self.attrs = {'ref_level': 1}
 
 
 class TestSequenceExpandCase2(TestSequenceExpand):
diff --git a/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py
new file mode 100644
index 0000000000..aeaae90581
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+import paddle.fluid.core as core
+
+
+class TestShuffleChannelOp(OpTest):
+    def setUp(self):
+        self.op_type = "shuffle_channel"
+        self.batch_size = 10
+        self.input_channels = 16
+        self.layer_h = 4
+        self.layer_w = 4
+        self.group = 4
+        self.x = np.random.random(
+            (self.batch_size, self.input_channels, self.layer_h,
+             self.layer_w)).astype('float32')
+        self.inputs = {'X': self.x}
+        self.attrs = {'group': self.group}
+        n, c, h, w = self.x.shape
+        input_reshaped = np.reshape(self.x,
+                                    (-1, self.group, c // self.group, h, w))
+        input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4))
+        self.outputs = {'Out': np.reshape(input_transposed, (-1, c, h, w))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
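The expected output built in setUp above is the standard channel-shuffle computation: split the channels into groups, swap the group and sub-channel axes, then flatten back. A standalone NumPy sketch of the same reference (shapes illustrative):

    import numpy as np

    def shuffle_channel_ref(x, group):
        # x has layout (N, C, H, W); C must be divisible by group
        n, c, h, w = x.shape
        x = x.reshape(n, group, c // group, h, w)  # split C into (group, C/group)
        x = x.transpose(0, 2, 1, 3, 4)             # interleave the groups
        return x.reshape(n, c, h, w)

    x = np.random.random((10, 16, 4, 4)).astype('float32')
    out = shuffle_channel_ref(x, group=4)  # what the op's 'Out' should equal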
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
index 41797a241c..ae1883f1f7 100644
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -18,6 +18,7 @@ import numpy as np
 from op_test import OpTest
 from scipy.special import logit
 from scipy.special import expit
+import paddle.fluid.core as core
 import unittest
 
 @@ -117,5 +118,36 @@ class TestSigmoidCrossEntropyWithLogitsOp3(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestSigmoidCrossEntropyWithNorm(OpTest):
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = 64
+        num_classes = 20
+        ignore_index = -1
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype("float32")),
+            'Label': np.random.randint(-1, 2, (batch_size, num_classes))
+            .astype("float32")
+        }
+        self.attrs = {'ignore_index': ignore_index, 'normalize': True}
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        out = -term1 - term2
+        out[np.where(self.inputs['Label'] == ignore_index)] = 0
+        if self.attrs['normalize']:
+            out = out / float(
+                np.where(self.inputs['Label'] != ignore_index)[0].size)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py
index 6b78ceeaee..89dd4dd50b 100644
--- a/python/paddle/fluid/transpiler/details/checkport.py
+++ b/python/paddle/fluid/transpiler/details/checkport.py
@@ -16,6 +16,7 @@ import sys
 import time
 import socket
 from contextlib import closing
+from six import string_types
 
 
 def wait_server_ready(endpoints):
@@ -32,6 +33,7 @@ def wait_server_ready(endpoints):
             wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
     """
+    assert not isinstance(endpoints, string_types)
     while True:
         all_ok = True
         not_ready_endpoints = []
@@ -45,7 +47,7 @@ def wait_server_ready(endpoints):
                 all_ok = False
                 not_ready_endpoints.append(ep)
         if not all_ok:
-            sys.stderr.write("pserver not ready, wait 3 sec to retry...\n")
+            sys.stderr.write("server not ready, wait 3 sec to retry...\n")
             sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) +
                              "\n")
             sys.stderr.flush()
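The new assertion rejects a bare "ip:port" string early, which would otherwise be iterated character by character as if it were a list of endpoints. A short usage sketch (endpoints illustrative; the import path assumes the helper is re-exported from transpiler.details, as its callers elsewhere in this patch do):

    from paddle.fluid.transpiler.details import wait_server_ready

    wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])  # OK: a list of endpoints
    # wait_server_ready("127.0.0.1:8080")  # now trips the isinstance assertion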
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index ea5a4cf7cd..e58f34e375 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -39,7 +39,7 @@ from .ps_dispatcher import RoundRobin, PSDispatcher
 from .. import core, framework, unique_name
 from ..framework import Program, default_main_program, \
     default_startup_program, Block, \
-    Parameter, grad_var_name
+    Parameter, Variable, grad_var_name
 from .details import *
 from ..distribute_lookup_table import find_distributed_lookup_table
 from functools import reduce
@@ -62,6 +62,260 @@ def log(*args):
         print(args)
 
 
+class VarStruct(object):
+    """
+    record a subset of a Variable's properties in Python.
+    """
+
+    def __init__(self, name, shape, dtype, type, lod_level, persistable):
+        self.name = name
+        self.shape = shape
+        self.dtype = dtype
+        self.type = type
+        self.lod_level = lod_level
+        self.persistable = persistable
+
+
+class VarDistributed(object):
+    """
+    a class to record how a var is distributed on the parameter servers:
+    the relationship between the origin var and its slice var, and the
+    slice var's properties such as type/shape/offset/endpoint.
+    """
+
+    def __init__(self,
+                 origin_var,
+                 slice_var,
+                 is_slice=None,
+                 block_id=None,
+                 offset=None,
+                 vtype=None,
+                 endpoint=None):
+        """
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): whether the var is a slice of the origin var;
+                                 a var is only sliced when its block size exceeds 8192.
+            block_id(int|None): the block index of the slice var.
+            offset(int|None): if the var is sliced, the number of elements in the
+                              origin var that precede this slice.
+            vtype(str|None): a tag, such as Optimizer/Param/RemotePrefetch.
+            endpoint(str|None): the parameter server endpoint the slice var lives on,
+                                such as "127.0.0.1:1001"
+        """
+
+        if isinstance(origin_var, Variable):
+            self.origin = self.__create_var_struct(origin_var)
+        else:
+            self.origin = origin_var
+
+        if isinstance(slice_var, Variable):
+            self.slice = self.__create_var_struct(slice_var)
+        else:
+            self.slice = slice_var
+
+        if self.equal(self.origin, self.slice):
+            self.is_slice = False
+            self.block_id = 0
+            self.offset = 0
+        else:
+            self.is_slice = True
+            self.block_id = 0
+            self.offset = 0
+
+        if is_slice is not None:
+            self.is_slice = is_slice
+        if block_id is not None:
+            self.block_id = block_id
+        if offset is not None:
+            self.offset = offset
+
+        self.vtype = vtype
+        self.endpoint = endpoint
+
+    @staticmethod
+    def __create_var_struct(var):
+        return VarStruct(var.name, var.shape, var.dtype, var.type,
+                         var.lod_level, var.persistable)
+
+    @staticmethod
+    def equal(var1, var2):
+        """
+        whether the two vars are equal.
+        Returns:
+            bool: True if all recorded properties match, else False
+        """
+        assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct)
+
+        return var1.name == var2.name and \
+               var1.type == var2.type and \
+               var1.shape == var2.shape and \
+               var1.dtype == var2.dtype and \
+               var1.lod_level == var2.lod_level and \
+               var1.persistable == var2.persistable
+
+    def __str__(self):
+        origin_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})". \
+            format(i="{", e="}", name=self.origin.name, type=self.origin.type,
+                   shape=self.origin.shape, dtype=self.origin.dtype)
+
+        slice_var_str = "{name} : fluid.{type}.shape{shape}.astype({dtype})" \
+                        ".slice({is_slice}).block({block_id}).offset({offset})". \
+            format(i="{", e="}", name=self.slice.name, type=self.slice.type,
+                   shape=self.slice.shape, dtype=self.slice.dtype,
+                   is_slice=self.is_slice, block_id=self.block_id, offset=self.offset)
+
+        return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format(
+            self.vtype, origin_var_str, slice_var_str, self.endpoint)
+
+
+class VarsDistributed(object):
+    """
+    a collection of VarDistributed objects with helper methods to look up
+    distributed vars. Through this class we can get an overview of the
+    distributed parameters on the parameter servers; it centralizes that
+    information and makes it convenient for developers to manage and query
+    how variables are distributed. Other modules, such as io.py, can also
+    use it to find variables.
+    """
+
+    def __init__(self):
+        self.distributed_vars = []
+
+    def add_distributed_var(self,
+                            origin_var,
+                            slice_var,
+                            is_slice=None,
+                            block_id=None,
+                            offset=None,
+                            vtype=None,
+                            endpoint=None):
+        """
+        add a distributed var to the collection.
+
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): whether the var is a slice of the origin var;
+                                 a var is only sliced when its block size exceeds 8192.
+            block_id(int|None): the block index of the slice var.
+            offset(int|None): if the var is sliced, the number of elements in the
+                              origin var that precede this slice.
+            vtype(str|None): a tag, such as Optimizer/Param/RemotePrefetch.
+            endpoint(str|None): the parameter server endpoint the slice var lives on,
+                                such as "127.0.0.1:1001"
+        Returns:
+            None
+        """
+        self.distributed_vars.append(
+            VarDistributed(origin_var, slice_var, is_slice, block_id, offset,
+                           vtype, endpoint))
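+
+    # NOTE: the lookup helpers below are linear scans over distributed_vars;
+    # the list typically holds one entry per parameter slice (plus optimizer
+    # vars), so O(n) lookups are acceptable here.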
+ """ + for dist_var in self.distributed_vars: + if dist_var.slice.name == var_name: + return dist_var + return None + + @staticmethod + def equal(var1, var2): + """ + the two var is equal or not. + Returns: + bool: equal will return True else False + """ + return var1.name == var2.name and \ + var1.type == var2.type and \ + var1.shape == var2.shape and \ + var1.dtype == var2.dtype and \ + var1.lod_level == var2.lod_level and \ + var1.persistable == var2.persistable + + def get_distributed_var_by_origin_and_ep(self, origin_var_name, endpoint): + """ + get distributed var by conditions. + + Args: + origin_var_name(str): + endpoint(str): the parameter endpoint, such as "127.0.0.1:1001" + Returns: + VarDistributed: distributed var. + """ + for dist_var in self.distributed_vars: + if dist_var.origin.name == origin_var_name and dist_var.endpoint == endpoint: + return dist_var + return None + + def get_distributed_vars_by_vtypes(self, vtypes, groupby=False): + """ + get distributed vars by conditions. + + Args: + vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch" + groupby(bool|False): group by origin var or not. + + Returns: + list: distributed var list. + dict: distributed var map when groupby=True + """ + vtype_vars = [] + for var in self.distributed_vars: + if var.vtype in vtypes: + vtype_vars.append(var) + if not groupby: + return vtype_vars + + params_map = {} + for var in vtype_vars: + origin_var_name = var.origin.name + + if origin_var_name in params_map.keys(): + optimizers = params_map.get(origin_var_name) + else: + optimizers = [] + optimizers.append(var) + params_map[origin_var_name] = optimizers + return params_map + + def get_distributed_vars_by_ep(self, endpoint, vtype=None): + """ + get distributed vars by conditions. + + Args: + endpoint(str): the parameter server endpoint, such as "127.0.0.1:2001" + vtype(str|None): distributed var's vtype, such as "Optimizer", "RemotePrefetch" + + Returns: + list: distributed var list. + """ + endpoint_vars = [] + for var in self.distributed_vars: + if var.endpoint == endpoint: + endpoint_vars.append(var) + if not vtype: + return endpoint_vars + + vtype_vars = [] + for var in endpoint_vars: + if var.vtype == vtype: + vtype_vars.append(var) + return vtype_vars + + def overview(self): + """ + get the overview string about all params on all parameter servers. + + Returns: + Str: overview string. 
+ + """ + vars_str = [] + for var in self.distributed_vars: + vars_str.append(str(var)) + return "\n".join(vars_str) + + class VarBlock: def __init__(self, varname, offset, size): self.varname = varname @@ -327,6 +581,7 @@ class DistributeTranspiler(object): self.trainer_id = trainer_id pserver_endpoints = pservers.split(",") self.pserver_endpoints = pserver_endpoints + self.vars_overview = VarsDistributed() self.optimize_ops, self.params_grads = self._get_optimize_pass() ps_dispatcher = self.config.split_method(self.pserver_endpoints) @@ -347,6 +602,7 @@ class DistributeTranspiler(object): # add distributed attrs to program self.origin_program._is_distributed = True self.origin_program._endpoints = self.pserver_endpoints + self.origin_program._ps_endpoint = current_endpoint self.origin_program._is_chief = self.trainer_id == 0 self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None @@ -454,6 +710,10 @@ class DistributeTranspiler(object): self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i]) self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) + distributed_var = self.vars_overview.get_distributed_var_by_slice( + recv_vars[i].name) + distributed_var.endpoint = ep + # step4: Concat the parameters splits together after recv. all_recv_outputs = [] for param_varname, splited_var in six.iteritems(self.param_var_mapping): @@ -480,6 +740,12 @@ class DistributeTranspiler(object): recv_op_role_var_name = splited_trainer_grad[0].name if param_varname in self.sparse_param_to_height_sections: + + for table_name in table_names: + distributed_var = self.vars_overview.get_distributed_var_by_slice( + table_name) + distributed_var.vtype = "RemotePrefetch" + height_sections = self.sparse_param_to_height_sections[ param_varname] self._update_remote_sparse_update_op( @@ -532,6 +798,9 @@ class DistributeTranspiler(object): pserver_endpoints) self._split_table_grad_and_add_send_vars(program, pserver_endpoints) + self._get_distributed_optimizer_vars() + self.origin_program._parameters_on_pservers = self.vars_overview + def get_trainer_program(self, wait_port=True): """ Get transpiled trainer side program. @@ -541,6 +810,7 @@ class DistributeTranspiler(object): """ # remove optimize ops and add a send op to main_program # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? + lr_ops = self._get_lr_ops() delete_ops(self.origin_program.global_block(), self.optimize_ops) delete_ops(self.origin_program.global_block(), lr_ops) @@ -665,9 +935,14 @@ class DistributeTranspiler(object): # NOTE: assume blocks of the same variable is not distributed # on the same pserver, only change param/grad varnames for # trainers to fetch. + sys.stderr.write( + "get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call.\n" + ) # step1 pserver_program = Program() pserver_program.random_seed = self.origin_program.random_seed + pserver_program._copy_dist_param_info_from(self.origin_program) + # step2: Create vars to receive vars at parameter servers. 
recv_inputs = [] for v in self.param_grad_ep_mapping[endpoint]["params"]: @@ -703,9 +978,6 @@ class DistributeTranspiler(object): else: recv_inputs.append(single_trainer_var) - self._slice_params_and_optimizes = self._get_slice_vars_and_attrs( - endpoint) - # step 3 # Create a union-find data structure from optimize ops, # If two ops are connected, we could add these two ops @@ -882,10 +1154,6 @@ class DistributeTranspiler(object): outputs={}, attrs=attrs) - # add distributed attrs - pserver_program._slice_vars_and_attrs = list( - self._slice_params_and_optimizes.values()) - pserver_program._sync_with_cpp() # save pserver program to generate pserver side startup relatively. self.pserver_program = pserver_program @@ -984,30 +1252,88 @@ class DistributeTranspiler(object): inputs={"X": startup_param_var}, outputs={"Out": startup_tmpvar}) - # add slice vars - s_prog._slice_vars_and_attrs = pserver_program._slice_vars_and_attrs - return s_prog - def _get_slice_vars_and_attrs(self, endpoint): - slice_vars_and_attrs = {} + # ====================== private transpiler functions ===================== + def _get_slice_var_info(self, slice_var): block_suffix = "block" - for param in self.param_grad_ep_mapping[endpoint]["params"]: - orig_var_name, block_name, _ = self._get_varname_parts(param.name) - if not block_name: - continue + block_idx = 0 + offset = 0 + is_slice = False - block_idx = int(block_name.split(block_suffix)[1]) - orig_var = self.origin_program.global_block().vars[orig_var_name] + orig_var_name, block_name, _ = self._get_varname_parts(slice_var.name) - skip_dim0 = 0 - slice_vars = self.param_var_mapping[orig_var_name] - for slice_var in slice_vars[:block_idx]: - skip_dim0 += slice_var.shape[0] - slice_vars_and_attrs[param.name] = [orig_var, skip_dim0, param] - return slice_vars_and_attrs + if not block_name: + return is_slice, block_idx, offset - # ====================== private transpiler functions ===================== + block_idx = int(block_name.split(block_suffix)[1]) + skip_dim0 = 0 + slice_vars = self.param_var_mapping[orig_var_name] + + orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:]) + + for slice_var in slice_vars[:block_idx]: + skip_dim0 += slice_var.shape[0] + + offset = skip_dim0 * orig_dim1_flatten + is_slice = True + return is_slice, block_idx, offset + + def _get_distributed_optimizer_vars(self): + def _get_distributed_optimizer_var(endpoint): + opt_op_on_pserver = [] + for _, op in enumerate(self.optimize_ops): + if self._is_optimizer_op(op) and self._is_opt_op_on_pserver( + endpoint, op): + opt_op_on_pserver.append(op) + + for opt_op in opt_op_on_pserver: + dist_var = None + for key in opt_op.input_names: + if key == "Param": + param_name = opt_op.input(key)[0] + dist_var = self.vars_overview.get_distributed_var_by_origin_and_ep( + param_name, endpoint) + break + for key in opt_op.input_names: + if key in ["Param", "Grad", "LearningRate"]: + continue + origin_var = self.origin_program.global_block().vars[ + opt_op.input(key)[0]] + # update accumulator variable shape + new_shape = self._get_optimizer_input_shape( + opt_op.type, key, origin_var.shape, + dist_var.slice.shape) + + if new_shape == dist_var.slice.shape: + splited_var = VarStruct( + name=origin_var.name, + shape=new_shape, + dtype=origin_var.dtype, + type=origin_var.type, + lod_level=origin_var.lod_level, + persistable=origin_var.persistable) + + self.vars_overview.add_distributed_var( + origin_var=origin_var, + slice_var=splited_var, + is_slice=dist_var.is_slice, + 
block_id=dist_var.block_id, + offset=dist_var.offset, + vtype="Optimizer", + endpoint=endpoint) + else: + self.vars_overview.add_distributed_var( + origin_var=origin_var, + slice_var=origin_var, + is_slice=False, + block_id=0, + offset=0, + vtype="Optimizer", + endpoint=endpoint) + + for ep in self.pserver_endpoints: + _get_distributed_optimizer_var(ep) def _update_dist_lookup_table_vars(self, param_list, grad_list, params_grads): @@ -1093,6 +1419,22 @@ class DistributeTranspiler(object): # origin_param_name -> [splited_param_vars] self.param_var_mapping = self._create_vars_from_blocklist( self.origin_program, param_blocks) + + for orig_name, splited_vars in self.param_var_mapping.items(): + orig_var = self.origin_program.global_block().var(orig_name) + + for splited_var in splited_vars: + is_slice, block_id, offset = self._get_slice_var_info( + splited_var) + + self.vars_overview.add_distributed_var( + origin_var=orig_var, + slice_var=splited_var, + block_id=block_id, + offset=offset, + is_slice=is_slice, + vtype="Param") + # origin_grad_name -> [splited_grad_vars] self.grad_var_mapping = self._create_vars_from_blocklist( self.origin_program, @@ -1729,13 +2071,6 @@ class DistributeTranspiler(object): shape=new_shape) new_inputs[key] = tmpvar - # var shape been changed - if new_shape != var.shape: - slice_var_args = self._slice_params_and_optimizes[ - param_var.name] - self._slice_params_and_optimizes[ - var.name] = [var, slice_var_args[1], tmpvar] - # change output's ParamOut variable outputs = self._get_output_map_from_op( self.origin_program.global_block().vars, opt_op) @@ -1763,8 +2098,8 @@ class DistributeTranspiler(object): # skip per trainer vars if g.name.find(".trainer_") == -1: # only param or grads have splited blocks - if self._orig_varname(g.name) in self.grad_name_to_param_name or\ - self._orig_varname(g.name) in self.param_name_to_grad_name: + if self._orig_varname(g.name) in self.grad_name_to_param_name or \ + self._orig_varname(g.name) in self.param_name_to_grad_name: grad_block = g break return grad_block diff --git a/python/setup.py.in b/python/setup.py.in index e00c88b3a6..c947785cbf 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -109,10 +109,12 @@ packages=['paddle', 'paddle.fluid.contrib', 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', + 'paddle.fluid.contrib.reader', 'paddle.fluid.contrib.slim', 'paddle.fluid.contrib.slim.core', 'paddle.fluid.contrib.slim.graph', 'paddle.fluid.contrib.slim.prune', + 'paddle.fluid.contrib.slim.quantization', 'paddle.fluid.contrib.utils', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details']