diff --git a/.travis.yml b/.travis.yml
index a406841f6a..361136ac2c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -27,15 +27,6 @@ script:
     # 43min timeout
     paddle/scripts/paddle_docker_build.sh ${JOB}
     if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
-  - |
-    if [[ "$JOB" != "doc" ]]; then exit 0; fi;
-    # For document only
-    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
-    if [[ "$TRAVIS_BRANCH" != "develop"  && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
-    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
-    export DOCS_DIR=`pwd`
-    cd ..
-    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
 notifications:
   email:
     on_success: change
diff --git a/AUTHORS.md b/AUTHORS.md
index 8c4a113fc2..41b7193677 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -46,6 +46,7 @@
 | tianbingsz | Tian-Bing Xu |
 | tpatejko | Tomasz Patejko |
 | typhoonzero | Yi Wu |
+| velconia | Qi-Yang Min |
 | wanghaoshuang | Hao-Shuang Wang |
 | wangyang59 | Yang Wang |
 | wangzhen-nlp | Zhen Wang |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 231224f924..f56c5d382a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,6 +65,7 @@ option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better d
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
+option(WITH_INFERENCE    "Compile fluid inference library"              ON)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
 
@@ -72,6 +73,7 @@ option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VER
 if(NOT PY_VERSION)
   set(PY_VERSION 2.7)
 endif()
+set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -158,6 +160,7 @@ endif()
 ########################################################################################
 
 include(external/mklml)     # download mklml package
+include(external/xbyak)     # download xbyak package
 include(external/libxsmm)   # download, build, install libxsmm
 include(external/zlib)      # download, build, install zlib
 include(external/gflags)    # download, build, install gflags
@@ -174,6 +177,7 @@ include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
+include(external/cub)
 
 if(WITH_DISTRIBUTE)
     if(WITH_GRPC)
@@ -200,6 +204,14 @@ include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
 
+if(WITH_GPU)
+    include(cuda)
+    include(tensorrt)
+    include(external/anakin)
+else()
+  set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
+endif()
+
 include(cudnn)              # set cudnn libraries, must before configure
 include(cupti)
 include(configure)          # add paddle env configuration
@@ -228,14 +240,6 @@ set(EXTERNAL_LIBS
     ${PYTHON_LIBRARIES}
 )
 
-if(WITH_GPU)
-    include(cuda)
-    include(tensorrt)
-    include(external/anakin)
-else()
-  set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
-endif()
-
 if(WITH_AMD_GPU)
     find_package(HIP)
     include(hip)
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index f8aed5a5e0..6b22f8f520 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -85,8 +85,7 @@ def dist_transpile(trainer_id, args):
         trainer_id,
         pservers=pserver_endpoints,
         trainers=trainers,
-        sync_mode=not args.async_mode,
-        slice_var_up=not args.no_split_var)
+        sync_mode=not args.async_mode)
     if training_role == "PSERVER":
         pserver_program = t.get_pserver_program(current_endpoint)
         pserver_startup_program = t.get_startup_program(current_endpoint,
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index 2c84061ff5..9eebea816c 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -21,6 +21,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     ${CUDNN_ROOT}/lib64
     ${CUDNN_ROOT}/lib
     ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
+    ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
     $ENV{CUDNN_ROOT}
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
index fb3d8ef8d5..8b7d91f234 100644
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -8,6 +8,7 @@ set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header f
 set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
 
 set(ANAKIN_COMPILE_EXTRA_FLAGS 
+    -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
     -Wno-error=unused-variable -Wno-unused-variable 
     -Wno-error=format-extra-args -Wno-format-extra-args
     -Wno-error=comment -Wno-comment 
@@ -19,7 +20,7 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
     -Wno-reorder 
     -Wno-error=cpp)
 
-set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
+set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz")
 
 # A helper function used in Anakin, currently, to use it, one need to recursively include
 # nearly all the header files.
@@ -41,9 +42,9 @@ if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
     message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
     execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
     execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget --no-check-certificate -q ${ANAKIN_LIBRARY_URL}")
     execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
-    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin.tar.gz")
 endif()
 
 if (WITH_ANAKIN)
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
new file mode 100644
index 0000000000..c94849cf4b
--- /dev/null
+++ b/cmake/external/cub.cmake
@@ -0,0 +1,35 @@
+if(NOT WITH_GPU)
+  return()
+endif()
+
+include(ExternalProject)
+
+set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub)
+set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub)
+
+include_directories(${CUB_INCLUDE_DIR})
+
+ExternalProject_Add(
+  extern_cub
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  GIT_REPOSITORY "https://github.com/NVlabs/cub.git"
+  GIT_TAG        "v1.8.0"
+  PREFIX         ${CUB_SOURCE_DIR}
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+)
+
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c)
+  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+  add_library(cub STATIC ${dummyfile})
+else()
+  add_library(cub INTERFACE)
+endif()
+
+add_dependencies(cub extern_cub)
+
+LIST(APPEND externl_project_dependencies cub)
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index 82437a8424..7fb67afbe1 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -50,7 +50,7 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
     BUILD_IN_SOURCE 1
-    PATCH_COMMAND git apply ${PADDLE_SOURCE_DIR}/patches/grpc/fix_too_early_destory.patch
+    PATCH_COMMAND cp ${PADDLE_SOURCE_DIR}/patches/grpc/grpc_library.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/grpc_library.h && cp ${PADDLE_SOURCE_DIR}/patches/grpc/completion_queue.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/completion_queue.h
     # NOTE(yuyang18):
     # Disable -Werror, otherwise the compile will fail in MacOS.
     # It seems that we cannot configure that by make command.
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 20dda35c5c..260985cc8a 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -24,7 +24,7 @@ SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
 
 IF(WIN32 OR APPLE)
-    MESSAGE(WARNING 
+    MESSAGE(WARNING
         "Windows or Mac is not supported with MKLDNN in Paddle yet."
         "Force WITH_MKLDNN=OFF")
     SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
@@ -57,8 +57,10 @@ ExternalProject_Add(
     GIT_TAG             "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
+    CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS          -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
     CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
-    CMAKE_ARGS          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} 
+    CMAKE_ARGS          -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
     CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
     CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
     CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake
new file mode 100644
index 0000000000..384c2f9328
--- /dev/null
+++ b/cmake/external/xbyak.cmake
@@ -0,0 +1,58 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set(WITH_XBYAK ON)
+if(WIN32 OR APPLE)
+    SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
+    return()
+endif()
+
+include(ExternalProject)
+
+set(XBYAK_PROJECT       extern_xbyak)
+set(XBYAK_PREFIX_DIR    ${THIRD_PARTY_PATH}/xbyak)
+set(XBYAK_INSTALL_ROOT  ${THIRD_PARTY_PATH}/install/xbyak)
+set(XBYAK_INC_DIR       ${XBYAK_INSTALL_ROOT}/include)
+
+include_directories(${XBYAK_INC_DIR})
+include_directories(${XBYAK_INC_DIR}/xbyak)
+
+add_definitions(-DPADDLE_WITH_XBYAK)
+
+# xbyak options
+add_definitions(-DXBYAK64)
+add_definitions(-DXBYAK_NO_OP_NAMES)
+
+ExternalProject_Add(
+    ${XBYAK_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    DEPENDS             ""
+    GIT_REPOSITORY      "https://github.com/herumi/xbyak.git"
+    GIT_TAG             "v5.661"  # Jul 26th
+    PREFIX              ${XBYAK_PREFIX_DIR}
+    UPDATE_COMMAND      ""
+    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
+    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
+)
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";")
+    add_library(xbyak STATIC ${dummyfile})
+else()
+    add_library(xbyak INTERFACE)
+endif()
+
+add_dependencies(xbyak ${XBYAK_PROJECT})
+list(APPEND external_project_dependencies xbyak)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index eafb11b6f2..82c958073c 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -263,8 +263,11 @@ function(cc_test TARGET_NAME)
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     if (${cc_test_SERIAL})
-        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     endif()
   endif()
 endfunction(cc_test)
@@ -328,8 +331,11 @@ function(nv_test TARGET_NAME)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
-        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     endif()
   endif()
 endfunction(nv_test)
@@ -577,7 +583,9 @@ function(py_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+             COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
+             FLAGS_cpu_deterministic=true
+             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index e2c58cd560..aeb081e76e 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -148,18 +148,11 @@ if (WITH_ANAKIN AND WITH_GPU)
      list(APPEND inference_deps anakin_inference_lib)
 endif()
 
-copy(inference_api_lib DEPS paddle_inference_api paddle_inference_api_shared
-  SRCS ${src_dir}/${module}/paddle_inference_api.h 
-       ${src_dir}/${module}/demo_ci
-       ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libpaddle_inference_api*
-  DSTS ${dst_dir}/inference ${dst_dir}/inference ${dst_dir}/inference
-)
-list(APPEND inference_deps inference_api_lib)
-
 set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
   SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+       ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )
 
 set(module "platform")
diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md
deleted file mode 100644
index a141dcbca5..0000000000
--- a/doc/fluid/design/ir/draft.md
+++ /dev/null
@@ -1,89 +0,0 @@
-## Motivation
-
-There is a ```gap``` between the ```Program``` defined by
-user and the ```Executable``` that can be scheduled
-efficiently on heterogeneous hardware, either locally
-or distributedly.
-
-Usually, the ```gap``` is bridged by
-
-* A serious transformations with defined order.
-
-* These transformations usually involve
-```insert, delete, clustering, split, dependency analysis```.
-
-* Has a simple way to verify and debug each transformation.
-
-* Flexible to add, remove or customize transformations to fit
-the requirements of various algorithms (models) and hardware secenarios.
-
-Some other events also push us to a better unified pattern.
-
-* The deep learning framework is built around the concepts of graphs.
-To leverage tools such as compilation (e.g. TVM and nGraph) or
-cross-framework conversion (e.g. ONNX), we also need a intermediate
-representation that can be connected to the rest of the ecosystem.
-
-
-We need a unified pattern to naturally support the requirements
-described above. The pattern should fit both training, inference
-and other offline serielized model transformations.
-Learned from LLVM and other deep learning framework, we draft the
-design below.
-
-
-## Design
-
-### Major Concepts
-
-#### Node
-
-```Node``` represents an operation that performs some computation or
-a variable that is input or output of operation.
-
-```Node```s are connected to other ```Node```s via inputs and outputs.
-
-Other properties (maybe device placement information) can be added
-to ```Node``` in the future if it's a
-common requirement of many other ```Pass```es. Otherwise, it should live
-in a ```Node``` wrapper class that is private to some ```Pass``` or be
-a local member of a ```Pass```.
-
-#### Graph
-
-```Graph``` contains a list of ```Node```s, which are connected to
-each other via inputs and outputs.
-
-TODO: Better definitions for the graph.
-
-```Graph``` can also contain ```Attribute```s. ```Attribute```s
-can be ``any`` thing. For example, it can be a list of "wraper"
-nodes. The ```wrapper``` nodes compose ```Node```s and provide
-helper method for execution or transformation. ```Attribute```
-can also contain other things that describe some properties of
-the ```Graph``` or ```Graph``` nodes. ```Attribute``` can be passed
-across ```Pass```. However, it should be used with care.
-
-#### Pass
-
-```Pass``` represents a transformation of ```Graph```. Its input
-is a ```Graph``` and its output is also a ```Graph```. For example,
-a ```Pass``` can simply print out the ```Graph```. A ```Pass```
-can also fuse some ```Graph```'s ```Node```s.
-
-#### Optimize
-
-```Optimize``` contains a series of ```Pass``` with defined order.
-```Optimize``` transforms a ```Graph``` that only contains raw
-modeling logic to a ```Graph``` that can be run efficiently while
-maintaining the original modeling logic.
-
-
-### Optimize Process
-
-* Program is first converted to Graph.
-* Graph goes through a series of Pass
-* Graph is transformed from raw model logic to a
-form that is efficient to execute.
-
-Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
diff --git a/doc/fluid/design/ir/overview.md b/doc/fluid/design/ir/overview.md
new file mode 100644
index 0000000000..83ef97c99e
--- /dev/null
+++ b/doc/fluid/design/ir/overview.md
@@ -0,0 +1,185 @@
+## Motivation
+
+There is a `gap` between the `Program` defined by
+user and the `Executable` that can be scheduled
+efficiently on heterogeneous hardware, either locally
+or distributedly.
+
+Usually, the `gap` is bridged by
+
+* A serious transformations with defined order.
+
+* These transformations usually involve
+`insert, delete, clustering, split, dependency analysis`.
+
+* Has a simple way to verify and debug each transformation.
+
+* Flexible to add, remove or customize transformations to fit
+the requirements of various algorithms (models) and hardware secenarios.
+
+Some other events also push us to a better unified pattern.
+
+* The deep learning framework is built around the concepts of graphs.
+To leverage tools such as compilation (e.g. TVM and nGraph) or
+cross-framework conversion (e.g. ONNX), we also need a intermediate
+representation that can be connected to the rest of the ecosystem.
+
+
+We need a unified pattern to naturally support the requirements
+described above. The pattern should fit both training, inference
+and other offline serielized model transformations.
+Learned from LLVM and other deep learning framework, we draft the
+design below.
+
+
+## Design
+
+### Major Concepts
+
+#### Node
+
+`Node` represents an operation that performs some computation or
+a variable that is input or output of operation.
+
+`Node`s are connected to other `Node`s via inputs and outputs.
+
+Other properties (maybe device placement information) can be added
+to `Node` in the future if it's a
+common requirement of many other `Pass`es. Otherwise, it should live
+in a `Node` wrapper class that is private to some `Pass` or be
+a local member of a `Pass`.
+
+#### Graph
+
+`Graph` contains a list of `Node`s, which are connected to
+each other via inputs and outputs.
+
+TODO: Better definitions for the graph.
+
+`Graph` can also contain `Attribute`s. `Attribute`s
+can be `any` thing. For example, it can be a list of "wraper"
+nodes. The `wrapper` nodes compose `Node`s and provide
+helper method for execution or transformation. `Attribute`
+can also contain other things that describe some properties of
+the `Graph` or `Graph` nodes. `Attribute` can be passed
+across `Pass`. However, it should be used with care.
+
+```cpp
+class Graph {
+ public:
+  explicit Graph(const ProgramDesc &program);
+
+  bool Has(const std::string &attr_name) const;
+
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const;
+
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr);
+  const std::unordered_set<ir::Node *> &Nodes() const;
+
+  // Create a normal variable with non-null VarDesc.
+  ir::Node *CreateVarNode(VarDesc *var_desc);
+
+  // Create a normal runnable operator with OpDesc.
+  ir::Node *CreateOpNode(OpDesc *op_desc);
+
+  // Create a control dependency var that connects 2 operations. The
+  // var doesn't hold any data. Other than that, it's no different from
+  // other var, considering dependency analysis.
+  ir::Node *CreateControlDepVar();
+
+  // A more free style way of creating a graph node. Mostly use for test
+  // or "copy" from another node. Avoid using it if possible.
+  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type);
+
+  // Clear all node information of the graph and return the ownership of the
+  // nodes.
+  std::vector<std::unique_ptr<ir::Node>> ReleaseNodes();
+};
+```
+
+#### Pass
+
+`Pass` represents a transformation of `Graph`. Its input
+is a `Graph` and its output is also a `Graph`. For example,
+a `Pass` can simply print out the `Graph`. A `Pass`
+can also fuse some `Graph`'s `Node`s.
+
+```cpp
+class Pass {
+ public:
+
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const {
+    // Some correctness check.
+    auto new_graph = ApplyImpl(std::move(graph));
+    // Some correctness check.
+    return new_graph;
+  }
+
+  // Get a reference to the attributed previously set.
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const;
+
+  // Set a pointer to the attribute. Pass takes ownership of the attribute.
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr) ;
+
+  // Set a pointer to the attribute. Pass doesn't take ownership. Caller
+  // should delete the attribute.
+  template <typename AttrType>
+  void SetNotOwned(const std::string &attr_name, AttrType *attr);
+
+ protected:
+  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const = 0;
+};
+
+// In my_pass.cc
+class MyPass : public Pass {
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override {
+    // do something.
+    return graph;
+  }
+}
+REGISTER_PASS(my_pass, MyPass)
+.RequirePassAttr("places")
+.RequireGraphAttr("dep_vars");
+
+
+// To use the pass.
+auto my_pass = ir::PassRegistry::Instance().Get("my_pass");
+graph = my_pass->Apply(std::move(graph));
+// Note: to force link my_pass.cc, in the code:
+USE_PASS(my_pass);
+```
+
+#### Optimize
+
+`Optimize` contains a series of `Pass` with defined order.
+`Optimize` transforms a `Graph` that only contains raw
+modeling logic to a `Graph` that can be run efficiently while
+maintaining the original modeling logic.
+
+
+### Optimize Process
+
+* Program is first converted to Graph.
+* Graph goes through a series of Pass
+* Graph is transformed from raw model logic to a
+form that is efficient to execute.
+
+```
+// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
+auto graph = Graph(program);
+graph = PassRegistry::Instance().Get("op_fuse_pass").Apply(std::move(grah));
+// For more complex Pass, Optimize Process can provide Pass attributes.
+auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass");
+mem_opt_pass.SetNotOwned<int>("optimize_level", 1);
+mem_opt_pass->Apply(std::move(graph));
+graph = PassRegistry::Instance().Get("multi_devices_pass").Apply(std::move(grah));
+graph = PassRegistry::Instance().Get("multi_devices_check_pass").Apply(std::move(grah));
+Executor exe;
+exe.Run(graph);
+
+```
diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md
index 5d061e1c00..faf39f276d 100644
--- a/doc/fluid/howto/optimization/timeline_cn.md
+++ b/doc/fluid/howto/optimization/timeline_cn.md
@@ -1,21 +1,27 @@
 # 如何使用timeline工具做性能分析
 
-1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后，代码会在`/tmp/profile`目录下生成一个profile的记录文件。
+1. 在训练的主循环外加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后，代码会在`/tmp/profile`目录下生成一个profile的记录文件。
 
 	**提示：**
 	请不要在timeline记录信息时运行太多次迭代，因为timeline中的记录数量和迭代次数是成正比的。
 
 	```python
-	with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-	    for pass_id in range(pass_num):
-	        for batch_id, data in enumerate(train_reader()):
-	            exe.run(fluid.default_main_program(),
-	                    feed=feeder.feed(data),
-	                    fetch_list=[])
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
 	            ...
 	```
 
 1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`，这个程序默认会生成一个`/tmp/timeline`文件，你也可以用命令行参数来修改这个路径，请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。
+```python
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
 
 1. 打开chrome浏览器，访问<chrome://tracing/>，用`load`按钮来加载生成的`timeline`文件。
 
diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md
index 96481ae2a6..6f963c6b4d 100644
--- a/doc/fluid/howto/optimization/timeline_en.md
+++ b/doc/fluid/howto/optimization/timeline_en.md
@@ -1,15 +1,17 @@
 # how to use timeline tool to do profile
 
-1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
+1. Add `profiler.start_profiler(...)`和`profiler.stop_profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
 
 	```python
-	with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-	    for pass_id in range(pass_num):
-	        for batch_id, data in enumerate(train_reader()):
-	            exe.run(fluid.default_main_program(),
-	                    feed=feeder.feed(data),
-	                    fetch_list=[],
-	                    use_program_cache=True)
+    for pass_id in range(pass_num):
+        for batch_id, data in enumerate(train_reader()):
+            if pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile")
+            exe.run(fluid.default_main_program(),
+                    feed=feeder.feed(data),
+                    fetch_list=[])
 	            ...
 	```
 
@@ -17,6 +19,10 @@
 file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
 [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
 
+```python
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
 1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
 
 	![chrome tracing](./tracing.jpeg)
diff --git a/doc/survey/op_fusion_design.md b/doc/survey/op_fusion_design.md
new file mode 100644
index 0000000000..d6e48f4f58
--- /dev/null
+++ b/doc/survey/op_fusion_design.md
@@ -0,0 +1,20 @@
+# Operator fusion  
+Fusing multiple operators together is an important method to optimize the program execution, particularly for GPU or other specialized accelerators. An obvious benefit is to avoid the overhead of saving the intermediate result back into global memory.   
+
+There are generally two ways to fuse operators, fusing directly connected operators and fusing non directly connected operators. The first method is mainly used by [NNVM Compiler](https://github.com/dmlc/tvm/) and [XLA](https://www.tensorflow.org/performance/xla/). The second method is mainly used by Dynet and TensorFlow Fold to do auto-batching. The principle of fusing operator is according to some rules to combine multiple operations into one, for example, `Y = X * W` and `Z = Y + B` can be fused to `Z = X * W + B`, and `Y1 = X1 * W` and `Y2 = X2 * W` can be fused to `[Y1;Y2] = [X1;X2] * W`. In order to get a short-term profit, we decided to try to manually specify these rules.   
+
+## Challenge
+The challenge of fusing operators is:
+  - how to make the rules.
+  - how to implement these rules efficiently.
+
+### How to make the rules?
+
+The problem of determining the best single location for a fusion operator is an NP-hard combinatorial problem. After analysis the operators of the DL model, we found there are two group of operators can be fused explicitly, one is the simple and adjacent operations, for example, `tmp = x + y` and `z = Relu(tmp)`, and the other is the operators that have the same function, for example, a serials of `SGD` or `Momentum`. They usually appear in the model in a large number. So we should think about how to fuse them separately first.
+
+### How to implement these rules efficiently?
+#### How to fuse the adjacent operations efficiently?
+Here we use a template function to represent the fused operations. The pros of using a template function are that it is simple and efficient, and the cons are that it is not easy to expand, and it can only be used to express some simple operations. So taking into account our current needs, the template function is more appropriate.
+
+#### How to fuse the operators that have the same function efficiently?
+We take SGD operator as an example, the training model may have hundreds of parameters and correspondingly have the same number of SGD operators. The expression(`w = w - lr*w_g`) of those operators is the same, so during of training, the executor will execute this expression hundreds time in CPU or other specialized accelerators. If we can fuse them and make the address of all `w` and all `w_g` continuous respectively, we only need execute one time. For some accelerators, the time of launching kernel is not neglected, so the time of hundreds of times of launching and executing kernel may be larger than launching and executing only once. There usually are many operators that similar to `SGD` in the DL model, such as `AllReduce` and `FC`.
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 6efb03dabe..dd172ff9c9 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -170,6 +170,7 @@ paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], var
 paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
@@ -201,7 +202,6 @@ paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=
 paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@@ -225,17 +225,14 @@ paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=
 paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1))
 paddle.fluid.layers.StaticRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
-paddle.fluid.layers.StaticRNN.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.step ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.step_output ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.ParallelDo.__init__ ArgSpec(args=['self', 'places', 'use_nccl', 'name'], varargs=None, keywords=None, defaults=(False, None))
-paddle.fluid.layers.ParallelDo.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.ParallelDo.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.ParallelDo.get_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.ParallelDo.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@@ -266,9 +263,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='ar
 paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
-paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.sigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.logsigmoid ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
@@ -309,7 +304,9 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
 paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
 paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
 paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
+paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
+paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
 paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
@@ -339,6 +336,7 @@ paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=Non
 paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
 paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index d274d96c29..2577e59d9c 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -5,5 +5,7 @@ add_subdirectory(operators)
 add_subdirectory(pybind)
 add_subdirectory(string)
 add_subdirectory(recordio)
-# NOTE: please add subdirectory inference at last.
-add_subdirectory(inference)
+if(WITH_INFERENCE)
+  # NOTE: please add subdirectory inference at last.
+  add_subdirectory(inference)
+endif()
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index de06c860f5..1d62792b80 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -7,10 +7,11 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
+cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
-  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
 endif()
 
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@@ -22,7 +23,12 @@ endif()
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
-nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context tensor)
+if(WITH_GPU)
+  nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
+else()
+  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
+endif()
+
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
@@ -94,7 +100,7 @@ else()
 endif()
 
 
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass)
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
@@ -105,7 +111,7 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
 
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-      
+
 # cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index ce48548418..960ca39e1e 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -88,9 +88,8 @@ class BlockDesc {
   OpDesc *InsertOp(size_t index);
 
   /*
-   * Remove Op and its input/output variables.
-   * Note that for either input or output variable, if it is also an input or
-   * output variable of other ops, we should remain it.
+   * Only remove op itself,
+   * do nothing to its input and output variables
    */
   void RemoveOp(size_t s, size_t e);
 
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
index 60382faffb..1a9ce746ea 100644
--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -17,6 +17,8 @@
 #include <string>
 #include <unordered_map>
 
+using float16 = paddle::platform::float16;
+
 namespace paddle {
 namespace framework {
 
@@ -53,7 +55,7 @@ static DataTypeMap* InitDataTypeMap() {
   RegisterType<cc_type>(retv, proto_type, #cc_type)
 
   // NOTE: Add your customize type here.
-  RegType(platform::float16, proto::VarType::FP16);
+  RegType(float16, proto::VarType::FP16);
   RegType(float, proto::VarType::FP32);
   RegType(double, proto::VarType::FP64);
   RegType(int, proto::VarType::INT32);
diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc
new file mode 100644
index 0000000000..54c41c55ba
--- /dev/null
+++ b/paddle/fluid/framework/data_type_test.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/data_type.h"
+
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/tensor.h"
+
+TEST(DataType, float16) {
+  using paddle::framework::Tensor;
+  using paddle::platform::CPUPlace;
+  using paddle::platform::float16;
+  namespace f = paddle::framework;
+  f::proto::VarType::Type dtype = f::proto::VarType::FP16;
+
+  Tensor tensor;
+  CPUPlace cpu;
+  tensor.mutable_data(cpu, f::ToTypeIndex(dtype));
+
+  // test fp16 tensor
+  EXPECT_EQ(tensor.type(), std::type_index(typeid(float16)));
+
+  // test fp16 size
+  EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u);
+
+  // test debug info
+  std::string type = "float16";
+  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
+}
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index df55b3d054..8f6c4163d6 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -1,13 +1,13 @@
-cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto)
+cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
 
-cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph)
-cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
-cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
+cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
+cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
+cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
 
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
 
@@ -28,12 +28,9 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
-cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
+cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
         scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
 
-
-cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
-
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 700c73c745..bf493a3fa4 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
+  platform::RecordEvent r("all_reduce", nullptr);
   if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
   } else {
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index b2e5399e23..8714a42162 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -21,6 +21,26 @@ namespace framework {
 namespace details {
 
 struct BuildStrategy {
+  // ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
+  // kReduce, for CPU and GPU. If you use kAllReduce, different threads
+  // optimize their parameters separately. If you use kReduce, the optimizations
+  // of parameters are distributed to different threads.
+  // For example, a model has 100 parameters and is running with four threads,
+  // if you choose kAllReduce, every thread is to optimize 100 parameters
+  // separately, if you choose kReduce, every thread is to optimize 25
+  // parameters.
+  // Of particular note is, if you use kReduce when using CPU training,
+  // all the parameters are shared between different threads. This feature will
+  // save memory.
+  // FIXME(zcd): The result of the two modes(kAllReduce and kReduce) maybe not
+  // equal for GPU. Because, the result of the different order of summing maybe
+  // different, for example, the result of `a+b+c+d` may be different with the
+  // result of `c+a+b+d`.
+  // For GPU, the implementation of kAllReduce and kReduce is adopted NCCL,
+  // so the result of kAllReduce and kReduce maybe not equal.
+  // For CPU, if you want to fix the order of summing to make the result
+  // of kAllReduce and kReduce no diff, you can add
+  // `FLAGS_cpu_deterministic=true` to env.
   enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
 
   enum class GradientScaleStrategy {
diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h
new file mode 100644
index 0000000000..6e302a2923
--- /dev/null
+++ b/paddle/fluid/framework/details/exception_holder.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class ExceptionHolder {
+ public:
+  void Catch(const platform::EnforceNotMet& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    exception_.reset(new platform::EnforceNotMet(exp));
+    type_ = kEnforceNotMet;
+  }
+
+  void Catch(const platform::EOFException& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    // EOFException will not cover up existing EnforceNotMet.
+    if (exception_.get() == nullptr) {
+      exception_.reset(new platform::EOFException(exp));
+      type_ = kEOF;
+    }
+  }
+
+  bool ExceptionCatched() const {
+    std::lock_guard<std::mutex> lock(mu_);
+    return exception_.get() != nullptr;
+  }
+
+  void Throw() {
+    std::lock_guard<std::mutex> lock(mu_);
+    switch (type_) {
+      case kNone:
+        break;
+      case kEnforceNotMet: {
+        auto e = *static_cast<platform::EnforceNotMet*>(exception_.get());
+        throw e;
+        break;
+      }
+      case kEOF: {
+        auto e = *static_cast<platform::EOFException*>(exception_.get());
+        throw e;
+        break;
+      }
+      default:
+        LOG(FATAL) << "Unknown exception.";
+    }
+    exception_.reset();
+    type_ = kNone;
+  }
+
+  void Clear() {
+    std::lock_guard<std::mutex> lock(mu_);
+    exception_.reset();
+    type_ = kNone;
+  }
+
+ private:
+  enum ExceptionType { kNone, kEnforceNotMet, kEOF };
+  ExceptionType type_{kNone};
+
+  std::unique_ptr<std::exception> exception_;
+  mutable std::mutex mu_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
similarity index 75%
rename from paddle/fluid/framework/details/ssa_graph_checker.cc
rename to paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
index 7c79d7f1e8..c9c255864a 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/details/ssa_graph_checker.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include <string>
 #include "paddle/fluid/framework/ir/graph.h"
 
@@ -20,7 +20,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
+bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const {
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   std::unordered_set<VarHandleBase *> ready_vars;
@@ -33,7 +33,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
     }
   };
 
-  for (auto &var_map : graph->Get<GraphVars>("vars")) {
+  for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
         insert_pending_var(version_pair.get());
@@ -41,11 +41,11 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
     }
   }
 
-  for (auto &var : graph->Get<GraphDepVars>("dep_vars")) {
+  for (auto &var : graph->Get<GraphDepVars>(kGraphDepVars)) {
     insert_pending_var(var.get());
   }
 
-  for (auto &op : graph->Get<GraphOps>("ops")) {
+  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
     if (op->Inputs().empty()) {
       ready_ops.insert(op.get());
     } else {
@@ -85,3 +85,10 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
+
+REGISTER_PASS(multi_devices_check_pass,
+              paddle::framework::details::SSAGraghBuilderWithChecker)
+    .RequireGraphAttr(paddle::framework::details::kGraphVars)
+    .RequireGraphAttr(paddle::framework::details::kGraphDepVars)
+    .RequireGraphAttr(paddle::framework::details::kGraphOps)
+    .RequireGraphAttr(paddle::framework::details::kShardedVarDevice);
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/multi_devices_graph_check_pass.h
similarity index 53%
rename from paddle/fluid/framework/details/ssa_graph_checker.h
rename to paddle/fluid/framework/details/multi_devices_graph_check_pass.h
index 2e397e8682..1e2b1867c3 100644
--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 
 #include <string>
 
@@ -22,26 +22,15 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
- public:
-  explicit SSAGraghBuilderWithChecker(
-      std::unique_ptr<SSAGraphBuilder>&& builder)
-      : builder_(std::move(builder)) {}
-
-  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
-    auto new_graph = builder_->Apply(std::move(graph));
-    PADDLE_ENFORCE(IsValidGraph(new_graph.get()));
-    return new_graph;
-  }
-
-  int GetVarDeviceID(const std::string& var_name) const override {
-    return builder_->GetVarDeviceID(var_name);
+class SSAGraghBuilderWithChecker : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override {
+    PADDLE_ENFORCE(IsValidGraph(graph.get()));
+    return graph;
   }
 
-  bool IsValidGraph(const Graph* graph) const;
-
- private:
-  std::unique_ptr<SSAGraphBuilder> builder_;
+  bool IsValidGraph(const ir::Graph* graph) const;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
similarity index 61%
rename from paddle/fluid/framework/details/multi_devices_graph_builder.cc
rename to paddle/fluid/framework/details/multi_devices_graph_pass.cc
index f1f8674caf..c5a13e7e1f 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -21,10 +21,11 @@
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/data_balance_op_handle.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/scope.h"
@@ -32,31 +33,109 @@
 namespace paddle {
 namespace framework {
 namespace details {
+namespace {
+void PolishGraphToSupportDataHazards(ir::Graph *graph) {
+  for (auto &var_map : graph->Get<GraphVars>(kGraphVars)) {
+    for (auto &name_pair : var_map) {
+      if (name_pair.second.size() <= 1) {
+        continue;
+      }
+      auto it_new = name_pair.second.rbegin();
+      auto it_old = name_pair.second.rbegin();
+      ++it_old;
+      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
+        OpHandleBase *write_op = (*it_new)->GeneratedOp();
+        const auto &read_ops = (*it_old)->PendingOps();
+
+        for (auto *read_op : read_ops) {
+          // Manually add a dependency var from read_op to write_op;
+          if (read_op == write_op) {
+            // Read Write is the same op.
+            continue;
+          }
+          bool has_dep = false;
+          for (auto *r_out : read_op->Outputs()) {
+            for (auto *w_in : write_op->Inputs()) {
+              if (r_out->Node() == w_in->Node()) {
+                has_dep = true;
+                break;
+              }
+            }
+          }
+          if (has_dep) continue;
+
+          auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+          read_op->AddOutput(dep_var);
+          write_op->AddInput(dep_var);
+          graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+        }
+      }
+    }
+  }
+}
+
+VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node,
+                                      const platform::Place &place,
+                                      size_t place_offset) {
+  auto &var_holders = graph->Get<GraphVars>(kGraphVars)[place_offset];
+  auto &var_holder = var_holders[node->Name()];
+  VarHandle *var = nullptr;
+  if (var_holder.empty()) {
+    if (node->Var()) {
+      var = new VarHandle(graph->CreateVarNode(node->Var()), 0, place_offset,
+                          node->Name(), place);
+    } else {
+      var = new VarHandle(
+          graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0,
+          place_offset, node->Name(), place);
+    }
+    var_holder.emplace_back(var);
+  } else {
+    var = var_holder.rbegin()->get();
+  }
+  return var;
+}
 
+void CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle,
+                    ir::Node *new_node, const platform::Place &place,
+                    size_t place_offset) {
+  auto &vars =
+      graph->Get<GraphVars>(kGraphVars)[place_offset][new_node->Name()];
+  size_t version = vars.size();
+  auto var =
+      new VarHandle(new_node, version, place_offset, new_node->Name(), place);
+  vars.emplace_back(var);
+  op_handle->AddOutput(var);
+}
+
+void AddOutputToLeafOps(ir::Graph *graph) {
+  for (auto &op : graph->Get<GraphOps>(kGraphOps)) {
+    if (!op->Outputs().empty()) {
+      continue;
+    }
+    auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
+    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
+    op->AddOutput(dummy_leaf);
+  }
+}
+}  // namespace
+
+static const char kLossVarName[] = "loss_var_name";
+static const char kPlaces[] = "places";
+static const char kParams[] = "params";
+static const char kLocalScopes[] = "local_scopes";
+static const char kStrategy[] = "strategy";
+
+void MultiDevSSAGraphBuilder::Init() const {
+  loss_var_name_ = Get<const std::string>(kLossVarName);
+  places_ = Get<const std::vector<platform::Place>>(kPlaces);
+  local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
+  strategy_ = Get<const BuildStrategy>(kStrategy);
 #ifdef PADDLE_WITH_CUDA
-MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
-    const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes,
-    platform::NCCLContextMap *nccl_ctxs, const BuildStrategy &strategy)
-    : loss_var_name_(loss_var_name),
-      places_(places),
-      local_scopes_(local_scopes),
-      nccl_ctxs_(nccl_ctxs),
-      strategy_(strategy) {
-#else
-MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
-    const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, const BuildStrategy &strategy)
-    : loss_var_name_(loss_var_name),
-      places_(places),
-      local_scopes_(local_scopes),
-      strategy_(strategy) {
+  nccl_ctxs_ = &Get<platform::NCCLContextMap>("nccl_ctxs");
 #endif
-  for (auto &p : params) {
+
+  for (auto &p : Get<const std::unordered_set<std::string>>(kParams)) {
     grad_names_.insert(GradVarName(p));
   }
   balance_vars_.resize(places_.size(), 0);
@@ -67,10 +146,11 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   }
 }
 
-void MultiDevSSAGraphBuilder::CreateOpHandleIOs(Graph *result, ir::Node *node,
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result,
+                                                ir::Node *node,
                                                 size_t place_id) const {
   auto p = places_[place_id];
-  auto *op_handle = result->Get<GraphOps>("ops").back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
   op_handle->SetDeviceContext(p,
                               platform::DeviceContextPool::Instance().Get(p));
 
@@ -92,12 +172,11 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(Graph *result, ir::Node *node,
 }
 
 std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
-    const std::vector<std::unique_ptr<ir::Node>> &nodes) const {
+    const std::vector<ir::Node *> &nodes) const {
   std::vector<std::string> send_vars;
   // since parameters are all in block 0,
   // it's enough to only scan send ops in block 0
   for (auto &node : nodes) {
-    if (node->NodeType() != ir::Node::Type::kOperation) continue;
     OpDesc *op = node->Op();
     // TODO(Yancey1989): use a graceful method to find send op,
     // instead of the the hard code string
@@ -112,10 +191,9 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
 }
 
 std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
-    const std::vector<std::unique_ptr<ir::Node>> &nodes) const {
+    const std::vector<ir::Node *> &nodes) const {
   std::vector<std::string> recv_vars;
   for (auto &node : nodes) {
-    if (node->NodeType() != ir::Node::Type::kOperation) continue;
     OpDesc *op = node->Op();
     // TODO(Yancey1989): use a graceful method to find recv op,
     // instead of the hard code string
@@ -170,6 +248,7 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
     const std::vector<std::string> &var_names) const {
   int64_t numel_sum = 0;
   for (auto var_name : var_names) {
+    if (all_vars_.find(var_name) == all_vars_.end()) continue;
     auto var_desc = all_vars_.at(var_name);
     PADDLE_ENFORCE_NOT_NULL(var_desc);
     auto dim = framework::make_ddim(var_desc->GetShape());
@@ -186,30 +265,83 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
   return dev_id;
 }
 
-std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
-    std::unique_ptr<Graph> graph) const {
-  // Rebuild the graph structure.
-  auto nodes = std::move(graph->nodes);
-  graph->nodes.clear();
+// Topology sort the graph nodes from inputs to outputs.
+// Since SSAGraphBuilder depends on forward/backward nodes to assign devices
+// to parameter/gradients before optimizer ops, topo sort is insufficient. (
+// some optimizer ops might not depend on any nodes), we manually move all
+// optimizer nodes after last backward nodes.
+// However, the assumption by SSAGraphBuilder should be relaxed in the future.
+std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {
+  std::vector<ir::Node *> ret = ir::TopologySortOperations(graph);
+  size_t last_backward = 0;
+  for (size_t i = 0; i < ret.size(); ++i) {
+    if (boost::get<int>(
+            ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+        static_cast<int>(OpRole::kBackward)) {
+      last_backward = i;
+    }
+  }
+
+  std::vector<ir::Node *> optimize_ops;
+  std::vector<ir::Node *> sorted_ret;
+  for (size_t i = 0; i < ret.size(); ++i) {
+    if (i < last_backward) {
+      if (boost::get<int>(ret[i]->Op()->GetAttr(
+              OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+          static_cast<int>(OpRole::kOptimize)) {
+        optimize_ops.push_back(ret[i]);
+      } else {
+        sorted_ret.push_back(ret[i]);
+      }
+    } else if (i == last_backward) {
+      sorted_ret.push_back(ret[i]);
+      // Verify that no operations before optimize ops depends on optimize ops.
+      std::unordered_set<ir::Node *> optimize_set(optimize_ops.begin(),
+                                                  optimize_ops.end());
+      for (ir::Node *n : sorted_ret) {
+        for (ir::Node *in : n->inputs) {
+          for (ir::Node *pre_n : in->inputs) {
+            PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(),
+                           "optimize operations cannot be depended by forward "
+                           "or backward node %s -> %s",
+                           pre_n->Name(), n->Name());
+          }
+        }
+      }
+      sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(),
+                        optimize_ops.end());
+    } else {
+      sorted_ret.push_back(ret[i]);
+    }
+  }
+  return sorted_ret;
+}
+
+std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  Init();
+  // Give the topology sort order and rebuild the graph structure.
+  std::vector<ir::Node *> sorted_ops = SortOpsAndDelayOptimizeOp(*graph);
+  auto nodes = graph->ReleaseNodes();
+  ir::Graph &result = *graph;
 
   for (auto &node : nodes) {
-    if (node->NodeType() == ir::Node::Type::kVariable) {
+    if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) {
       all_vars_.emplace(node->Name(), node->Var());
     }
   }
-
-  Graph &result = *graph;
   std::unordered_set<std::string> og_has_been_broadcast;
 
   // We cannot invoke resize. It is a bug of GCC 4.8
-  result.Set("vars", new GraphVars(places_.size()));
-  result.Set("dep_vars", new GraphDepVars);
-  result.Set("ops", new GraphOps);
+  result.Set(kGraphVars, new GraphVars(places_.size()));
+  result.Set(kGraphDepVars, new GraphDepVars);
+  result.Set(kGraphOps, new GraphOps);
+  result.Set(kShardedVarDevice, new ShardedVarDevice);
 
   // find send/recv vars so that we can place the distributed training
-  // realted op in the place 0
-  auto send_vars = FindDistTrainSendVars(nodes);
-  auto recv_vars = FindDistTrainRecvVars(nodes);
+  // related op in the place 0
+  auto send_vars = FindDistTrainSendVars(sorted_ops);
+  auto recv_vars = FindDistTrainRecvVars(sorted_ops);
 
   std::vector<std::unordered_set<std::string>> bcast_var_name_set;
   bcast_var_name_set.resize(places_.size());
@@ -217,23 +349,20 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
   size_t cur_device_id = 0;
   bool is_forwarding = true;
 
-  // NOTE: Currently, passes before SSAGraphBuilder cannot reorder
-  // forward, backward nodes. E.g. you can't append an forward node
-  // at the end of the node list.
-  // TODO(panyx0718): FIXME: Needs to sort by forward->backward order.
-  for (auto &node : nodes) {
-    if (node->NodeType() != ir::Node::Type::kOperation) continue;
+  for (ir::Node *node : sorted_ops) {
     if (boost::get<int>(
             node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
         static_cast<int>(OpRole::kRPC)) {
-      CreateRPCOp(&result, node.get());
-    } else if (IsDistTrainOp(node.get(), send_vars, recv_vars)) {
-      CreateDistTrainOp(&result, node.get());
-    } else if (IsScaleLossOp(node.get())) {
+      CreateRPCOp(&result, node);
+    } else if (IsDistTrainOp(node, send_vars, recv_vars)) {
+      CreateDistTrainOp(&result, node);
+    } else if (IsScaleLossOp(node)) {
       // user can customize loss@grad if not use_default_grad_scale_
       if (strategy_.gradient_scale_ !=
           BuildStrategy::GradientScaleStrategy::kCustomized) {
-        CreateScaleLossGradOp(&result);
+        // TODO(paddle-dev): Why is there no input for this op_handle?
+        auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
+        CreateScaleLossGradOp(&result, loss_grad_name);
       }
       // This assumes the backward generating code will ensure IsScaleLossOp
       // is true only for the op that scale the final scalar loss.
@@ -241,24 +370,24 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
       // the block.
       is_forwarding = false;
     } else {
-      int op_dev_id = GetOpDeviceID(node.get());
+      int op_dev_id = GetOpDeviceID(result, node);
       if (op_dev_id != -1) {  // This op only runs on one specific device.
-        CreateComputationalOp(&result, node.get(), op_dev_id);
+        CreateComputationalOp(&result, node, op_dev_id);
         for (ir::Node *n : node->outputs) {
-          var_name_on_devices_.emplace(n->Name(), op_dev_id);
+          graph->Get<ShardedVarDevice>(kShardedVarDevice)
+              .emplace(n->Name(), op_dev_id);
         }
       } else {
         // This op runs on all devices, and its output may have parameter's
         // gradients.
+        // TODO(paddle-dev): Why is so special about "read" op?
         if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) {
           node->Op()->SetAttr("throw_eof_exp", false);
-          CreateComputationalOps(&result, node.get(), places_.size());
-          // TODO(paddle-dev): builder shouldn't depend on the out logic of
-          // a specific op.
+          CreateComputationalOps(&result, node, places_.size());
           const auto &data_var_names = node->Op()->Output("Out");
           InsertDataBalanceOp(&result, data_var_names);
         } else {
-          CreateComputationalOps(&result, node.get(), places_.size());
+          CreateComputationalOps(&result, node, places_.size());
         }
 
         if (!is_forwarding && places_.size() > 1) {
@@ -283,7 +412,8 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
                   case BuildStrategy::ReduceStrategy::kReduce:
                     cur_device_id = GetAppropriateDeviceID({g_name});
                     CreateReduceOp(&result, g_name, cur_device_id);
-                    var_name_on_devices_.emplace(g_name, cur_device_id);
+                    graph->Get<ShardedVarDevice>(kShardedVarDevice)
+                        .emplace(g_name, cur_device_id);
                     bcast_var_name_set[cur_device_id].emplace(p_name);
                     break;
                   case BuildStrategy::ReduceStrategy::kAllReduce:
@@ -322,17 +452,17 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
       }
     }
   }
-
   /*
-    Dependency graph has been constructed. However, there are still data
-    hazards need to be handled.
-   */
+  Dependency graph has been constructed. However, there are still data
+  hazards need to be handled.
+ */
   PolishGraphToSupportDataHazards(&result);
 
   /*
    * Only variables should be the leaves of graph.
    */
   AddOutputToLeafOps(&result);
+  PADDLE_ENFORCE(!ir::HasCircle(result));
   return graph;
 }
 
@@ -357,7 +487,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext(
 #endif
 }
 
-void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result,
+void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
                                                 const std::string &p_name,
                                                 size_t src_dev_id) const {
 #ifdef PADDLE_WITH_CUDA
@@ -369,16 +499,16 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result,
       result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
       local_scopes_, places_);
 #endif
-  result->Get<GraphOps>("ops").emplace_back(op_handle);
+  result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
 
   auto *in =
-      result->Get<GraphVars>("vars").at(src_dev_id).at(p_name).back().get();
+      result->Get<GraphVars>(kGraphVars).at(src_dev_id).at(p_name).back().get();
   op_handle->AddInput(in);
 
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
-    auto &vars = result->Get<GraphVars>("vars").at(i).at(p_name);
+    auto &vars = result->Get<GraphVars>(kGraphVars).at(i).at(p_name);
     auto *out_var = new VarHandle(
         result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(),
         i, p_name, p);
@@ -387,32 +517,32 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result,
   }
 }
 
-void MultiDevSSAGraphBuilder::CreateComputationalOp(Graph *result,
+void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
                                                     ir::Node *node,
                                                     int dev_id) const {
-  result->Get<GraphOps>("ops").emplace_back(
+  result->Get<GraphOps>(kGraphOps).emplace_back(
       new ComputationOpHandle(result->CreateOpNode(node->Op()),
                               local_scopes_[dev_id], places_[dev_id]));
   CreateOpHandleIOs(result, node, dev_id);
 }
 
-void MultiDevSSAGraphBuilder::InsertAllReduceOp(Graph *result,
+void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result,
                                                 const std::string &og) const {
 #ifdef PADDLE_WITH_CUDA
-  result->Get<GraphOps>("ops").emplace_back(new AllReduceOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
       result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));
 #else
-  result->Get<GraphOps>("ops").emplace_back(new AllReduceOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
       result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
       local_scopes_, places_));
 #endif
-  auto *op_handle = result->Get<GraphOps>("ops").back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
 
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
-    auto &vars = result->Get<GraphVars>("vars")[i][og];
+    auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
     op_handle->AddInput(prev_grad.get());
@@ -426,22 +556,22 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(Graph *result,
 }
 
 void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
-    Graph *result, const std::vector<std::string> &datas) const {
+    ir::Graph *result, const std::vector<std::string> &datas) const {
 #ifdef PADDLE_WITH_CUDA
-  result->Get<GraphOps>("ops").emplace_back(new DataBalanceOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new DataBalanceOpHandle(
       result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));
 #else
-  result->Get<GraphOps>("ops").emplace_back(new DataBalanceOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new DataBalanceOpHandle(
       result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
       local_scopes_, places_));
 #endif
-  auto *op_handle = result->Get<GraphOps>("ops").back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
     for (const std::string &d_name : datas) {
-      auto &vars = result->Get<GraphVars>("vars")[i][d_name];
+      auto &vars = result->Get<GraphVars>(kGraphVars)[i][d_name];
       PADDLE_ENFORCE(!vars.empty());
       op_handle->AddInput(vars.back().get());
       auto var = new VarHandle(
@@ -465,7 +595,8 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
   return is_pg_once;
 }
 
-int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {
+int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
+                                           ir::Node *node) const {
   if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
     return -1;
   }
@@ -478,18 +609,21 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {
       node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
 
   PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
-  int dev_id = GetVarDeviceID(param_grad[1]);
-  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]",
-                    node->Op()->Type(), param_grad[0]);
+  int dev_id = GetVarDeviceID(graph, param_grad[1]);
+  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]",
+                    node->Op()->Type(), param_grad[0], param_grad[1]);
   return dev_id;
 }
 
-int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
-  auto got = var_name_on_devices_.find(varname);
-  return got == var_name_on_devices_.end() ? -1 : got->second;
+int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
+                                            const std::string &varname) const {
+  auto &sharded_var_device = graph.Get<ShardedVarDevice>(kShardedVarDevice);
+  auto got = sharded_var_device.find(varname);
+  return got == sharded_var_device.end() ? -1 : got->second;
 }
 
-void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const {
+void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
+    ir::Graph *result, const std::string &loss_grad_name) const {
   for (size_t i = 0; i < places_.size(); ++i) {
 // Insert ScaleCost OpHandle
 #ifdef PADDLE_WITH_CUDA
@@ -504,7 +638,7 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const {
         result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
         local_scopes_.size(), local_scopes_[i], places_[i],
         communication_dev_ctx);
-    result->Get<GraphOps>("ops").emplace_back(op_handle);
+    result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
 
     // FIXME: Currently ScaleLossGradOp only use device_count as scale
     // factor. So it does not depend on any other operators.
@@ -512,48 +646,48 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const {
     // loss->pending_ops_.emplace_back(op_handle);
     // op_handle->inputs_.emplace_back(loss);
 
-    CreateOpOutput(result, op_handle,
-                   result->CreateEmptyNode(GradVarName(loss_var_name_),
-                                           ir::Node::Type::kVariable),
-                   places_[i], i);
+    CreateOpOutput(
+        result, op_handle,
+        result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable),
+        places_[i], i);
   }
 }
 
-void MultiDevSSAGraphBuilder::CreateComputationalOps(Graph *result,
+void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
                                                      ir::Node *node,
                                                      size_t num_places) const {
   for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
     auto p = places_[scope_idx];
     auto s = local_scopes_[scope_idx];
-    result->Get<GraphOps>("ops").emplace_back(
+    result->Get<GraphOps>(kGraphOps).emplace_back(
         new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p));
     CreateOpHandleIOs(result, node, scope_idx);
   }
 }
 
-VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(Graph *result,
+VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
                                                    const std::string &og,
                                                    int dst_dev_id) const {
 #ifdef PADDLE_WITH_CUDA
-  result->Get<GraphOps>("ops").emplace_back(new ReduceOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new ReduceOpHandle(
       result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));
 #else
-  result->Get<GraphOps>("ops").emplace_back(new ReduceOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new ReduceOpHandle(
       result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
       local_scopes_, places_));
 #endif
-  auto *op_handle = result->Get<GraphOps>("ops").back().get();
+  auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
 
   for (size_t i = 0; i < places_.size(); ++i) {
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
-    auto &vars = result->Get<GraphVars>("vars")[i][og];
+    auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
     PADDLE_ENFORCE(!vars.empty());
     auto &prev_grad = vars.back();
     op_handle->AddInput(prev_grad.get());
   }
-  auto &vars = result->Get<GraphVars>("vars")[dst_dev_id][og];
+  auto &vars = result->Get<GraphVars>(kGraphVars)[dst_dev_id][og];
   auto var =
       new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
                     vars.size(), dst_dev_id, og, places_[dst_dev_id]);
@@ -564,20 +698,19 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(Graph *result,
 
 // Find the first occurence of `prev_op_name` and make current `op` depend
 // on it.
-void MultiDevSSAGraphBuilder::ConnectOp(Graph *result, OpHandleBase *op,
+void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op,
                                         const std::string &prev_op_name) const {
-  for (auto &prev_op : result->Get<GraphOps>("ops")) {
+  for (auto &prev_op : result->Get<GraphOps>(kGraphOps)) {
     if (prev_op->Name() == prev_op_name) {
-      auto *dep_var = new DummyVarHandle(
-          result->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
+      auto *dep_var = new DummyVarHandle(result->CreateControlDepVar());
       prev_op->AddOutput(dep_var);
-      result->Get<GraphDepVars>("dep_vars").emplace(dep_var);
+      result->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
       op->AddInput(dep_var);
     }
   }
 }
 
-void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result,
+void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
                                                 ir::Node *node) const {
   int op_dev_id = -1;
   std::vector<std::string> input_var_names;
@@ -591,20 +724,24 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result,
 
   if (node->Op()->Type() == "split_byref" ||
       node->Op()->Type() == "split_selected_rows") {
-    op_dev_id = GetVarDeviceID(input_var_names[0]);
+    // TODO(paddle-dev): getting the first var is not safe.
+    op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
     if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
       op_dev_id = GetAppropriateDeviceID(input_var_names);
       for (auto &varname : input_var_names) {
-        var_name_on_devices_.emplace(varname, op_dev_id);
+        result->Get<ShardedVarDevice>(kShardedVarDevice)
+            .emplace(varname, op_dev_id);
       }
     }
     for (auto &varname : output_var_names) {
-      var_name_on_devices_.emplace(varname, op_dev_id);
+      result->Get<ShardedVarDevice>(kShardedVarDevice)
+          .emplace(varname, op_dev_id);
     }
   } else if (node->Op()->Type() == "concat") {
-    op_dev_id = GetVarDeviceID(input_var_names[0]);
+    op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
     for (auto &varname : output_var_names) {
-      var_name_on_devices_.emplace(varname, op_dev_id);
+      result->Get<ShardedVarDevice>(kShardedVarDevice)
+          .emplace(varname, op_dev_id);
     }
   } else {
     PADDLE_ENFORCE(
@@ -618,16 +755,20 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result,
 
   CreateComputationalOp(result, node, op_dev_id);
   if (node->Op()->Type() == "concat") {
-    ConnectOp(result, result->Get<GraphOps>("ops").back().get(),
+    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
               "fetch_barrier");
   }
 }
 
 // Create RPC related op handles that connects its in ops and out ops.
-void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const {
+void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
+                                          ir::Node *node) const {
   int op_dev_id = -1;
   if (node->Op()->Type() == "send") {
-    op_dev_id = GetVarDeviceID(node->inputs[0]->Name());
+    // TODO(paddle-dev): getting the first var is not safe.
+    op_dev_id = GetVarDeviceID(*result, node->inputs[0]->Name());
+    PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]),
+                   "This hack no longer holds, please fix.");
     // the variable name which contains .block means it was splited by
     // split_byref op
     // so that we can balance the variable blocks to all the pserver
@@ -640,7 +781,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const {
       }
       op_dev_id = GetAppropriateDeviceID(input_var_names);
       for (auto &varname : input_var_names) {
-        var_name_on_devices_.emplace(varname, op_dev_id);
+        result->Get<ShardedVarDevice>(kShardedVarDevice)
+            .emplace(varname, op_dev_id);
       }
     }
   } else if (node->Op()->Type() == "recv") {
@@ -650,7 +792,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const {
     }
     op_dev_id = GetAppropriateDeviceID(output_var_names);
     for (auto &varname : output_var_names) {
-      var_name_on_devices_.emplace(varname, op_dev_id);
+      result->Get<ShardedVarDevice>(kShardedVarDevice)
+          .emplace(varname, op_dev_id);
     }
   } else {
     // send_barrier and fetch_barrier op can be scheduled on device 0
@@ -660,17 +803,18 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const {
   PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
                  node->Op()->Type());
 
-  result->Get<GraphOps>("ops").emplace_back(new RPCOpHandle(
+  result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
       result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
       node->Op()->Type(), places_[op_dev_id]));
 
+  // TODO(panyx0718): This might not be needed anymore.
   if (node->Op()->Type() == "send_barrier") {
-    ConnectOp(result, result->Get<GraphOps>("ops").back().get(), "send");
+    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "send");
   } else if (node->Op()->Type() == "recv") {
-    ConnectOp(result, result->Get<GraphOps>("ops").back().get(),
+    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
               "send_barrier");
   } else if (node->Op()->Type() == "fetch_barrier") {
-    ConnectOp(result, result->Get<GraphOps>("ops").back().get(), "recv");
+    ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "recv");
   } else if (node->Op()->Type() == "send") {
     // do nothing
   } else {
@@ -692,3 +836,11 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
+
+REGISTER_PASS(multi_devices_pass,
+              paddle::framework::details::MultiDevSSAGraphBuilder)
+    .RequirePassAttr(paddle::framework::details::kLossVarName)
+    .RequirePassAttr(paddle::framework::details::kPlaces)
+    .RequirePassAttr(paddle::framework::details::kParams)
+    .RequirePassAttr(paddle::framework::details::kLocalScopes)
+    .RequirePassAttr(paddle::framework::details::kStrategy);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
similarity index 50%
rename from paddle/fluid/framework/details/multi_devices_graph_builder.h
rename to paddle/fluid/framework/details/multi_devices_graph_pass.h
index 2b7f4f586b..7a6f238f9c 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -18,7 +18,7 @@
 #include <vector>
 
 #include "paddle/fluid/framework/details/build_strategy.h"
-#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
@@ -30,42 +30,32 @@ namespace framework {
 class Scope;
 namespace details {
 
-class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
- public:
-#ifdef PADDLE_WITH_CUDA
-  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
-                          const std::string &loss_var_name,
-                          const std::unordered_set<std::string> &params,
-                          const std::vector<Scope *> &local_scopes,
-                          platform::NCCLContextMap *nccl_ctxs,
-                          const BuildStrategy &strategy);
-#else
-  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
-                          const std::string &loss_var_name,
-                          const std::unordered_set<std::string> &params,
-                          const std::vector<Scope *> &local_scopes,
-                          const BuildStrategy &strategy);
-#endif
-  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override;
-  int GetVarDeviceID(const std::string &varname) const override;
+class MultiDevSSAGraphBuilder : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 
  private:
-  void CreateOpHandleIOs(Graph *result, ir::Node *node, size_t device_id) const;
+  void CreateOpHandleIOs(ir::Graph *result, ir::Node *node,
+                         size_t device_id) const;
+  void Init() const;
 
  private:
-  std::string loss_var_name_;
-  const std::vector<platform::Place> &places_;
-  const std::vector<Scope *> &local_scopes_;
-  std::unordered_set<std::string> grad_names_;
+  mutable std::string loss_var_name_;
+  mutable std::vector<platform::Place> places_;
+  mutable std::vector<Scope *> local_scopes_;
+  mutable std::unordered_set<std::string> grad_names_;
 
 #ifdef PADDLE_WITH_CUDA
-  platform::NCCLContextMap *nccl_ctxs_;
+  mutable platform::NCCLContextMap *nccl_ctxs_;
 #endif
 
+  int GetVarDeviceID(const ir::Graph &graph, const std::string &varname) const;
+
   bool IsScaleLossOp(ir::Node *node) const;
 
-  void CreateRPCOp(Graph *result, ir::Node *node) const;
-  void CreateDistTrainOp(Graph *result, ir::Node *node) const;
+  void CreateRPCOp(ir::Graph *result, ir::Node *node) const;
+  void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
 
   /**
    * Is this operator as the end-point operator before/after send operator.
@@ -74,34 +64,37 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                      const std::vector<std::string> &recv_vars) const;
 
   std::vector<std::string> FindDistTrainSendVars(
-      const std::vector<std::unique_ptr<ir::Node>> &nodes) const;
+      const std::vector<ir::Node *> &nodes) const;
 
   std::vector<std::string> FindDistTrainRecvVars(
-      const std::vector<std::unique_ptr<ir::Node>> &nodes) const;
+      const std::vector<ir::Node *> &nodes) const;
 
-  void ConnectOp(Graph *result, OpHandleBase *op,
+  void ConnectOp(ir::Graph *result, OpHandleBase *op,
                  const std::string &prev_op_name) const;
 
-  void CreateComputationalOps(Graph *result, ir::Node *node,
+  void CreateComputationalOps(ir::Graph *result, ir::Node *node,
                               size_t num_places) const;
 
-  void CreateScaleLossGradOp(Graph *result) const;
-  VarHandle *CreateReduceOp(Graph *result, const std::string &og,
+  void CreateScaleLossGradOp(ir::Graph *result,
+                             const std::string &loss_grad_name) const;
+
+  VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                             int dst_dev_id) const;
-  void CreateComputationalOp(Graph *result, ir::Node *node, int dev_id) const;
+  void CreateComputationalOp(ir::Graph *result, ir::Node *node,
+                             int dev_id) const;
 
   bool IsParameterGradientOnce(
       const std::string &og,
       std::unordered_set<std::string> *og_has_been_broadcast) const;
 
-  int GetOpDeviceID(ir::Node *node) const;
+  int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;
 
-  void InsertAllReduceOp(Graph *result, const std::string &og) const;
+  void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
 
-  void InsertDataBalanceOp(Graph *result,
+  void InsertDataBalanceOp(ir::Graph *result,
                            const std::vector<std::string> &datas) const;
 
-  void CreateBroadcastOp(Graph *result, const std::string &p_name,
+  void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
                          size_t src_dev_id) const;
 
   bool IsSparseGradient(const std::string &og) const;
@@ -110,9 +103,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
       const std::vector<std::string> &var_names) const;
 
  private:
-  BuildStrategy strategy_;
+  mutable BuildStrategy strategy_;
   mutable std::unordered_map<std::string, VarDesc *> all_vars_;
-  mutable std::unordered_map<std::string, int> var_name_on_devices_;
   mutable std::vector<int64_t> balance_vars_;
 
   void SetCommunicationContext(OpHandleBase *op_handle,
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
similarity index 82%
rename from paddle/fluid/framework/details/ssa_graph_printer.cc
rename to paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
index 6dd6fd262e..69944a42b6 100644
--- a/paddle/fluid/framework/details/ssa_graph_printer.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
 #include <string>
 #include "paddle/fluid/framework/ir/graph.h"
 
@@ -21,8 +21,8 @@ namespace framework {
 namespace details {
 
 template <typename Callback>
-static inline void IterAllVar(const Graph &graph, Callback callback) {
-  for (auto &each : graph.Get<GraphVars>("vars")) {
+static inline void IterAllVar(const ir::Graph &graph, Callback callback) {
+  for (auto &each : graph.Get<GraphVars>(kGraphVars)) {
     for (auto &pair1 : each) {
       for (auto &pair2 : pair1.second) {
         callback(*pair2);
@@ -30,12 +30,12 @@ static inline void IterAllVar(const Graph &graph, Callback callback) {
     }
   }
 
-  for (auto &var : graph.Get<GraphDepVars>("dep_vars")) {
+  for (auto &var : graph.Get<GraphDepVars>(kGraphDepVars)) {
     callback(*var);
   }
 }
 
-void GraphvizSSAGraphPrinter::Print(const Graph &graph,
+void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
                                     std::ostream &sout) const {
   size_t var_id = 0;
   std::unordered_map<const VarHandleBase *, size_t> vars;
@@ -61,7 +61,7 @@ void GraphvizSSAGraphPrinter::Print(const Graph &graph,
   });
 
   size_t op_id = 0;
-  for (auto &op : graph.Get<GraphOps>("ops")) {
+  for (auto &op : graph.Get<GraphOps>(kGraphOps)) {
     std::string op_name = "op_" + std::to_string(op_id++);
     sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
          << std::endl;
@@ -81,3 +81,6 @@ void GraphvizSSAGraphPrinter::Print(const Graph &graph,
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
+
+REGISTER_PASS(multi_devices_print_pass,
+              paddle::framework::details::SSAGraghBuilderWithPrinter);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
new file mode 100644
index 0000000000..c00685fa16
--- /dev/null
+++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <fstream>
+#include <iosfwd>
+#include <ostream>
+#include <string>
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class SSAGraphPrinter {
+ public:
+  virtual ~SSAGraphPrinter() {}
+  virtual void Print(const ir::Graph& graph, std::ostream& sout) const = 0;
+};
+
+class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
+ public:
+  void Print(const ir::Graph& graph, std::ostream& sout) const override;
+};
+
+class SSAGraghBuilderWithPrinter : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override {
+    std::unique_ptr<std::ostream> fout(
+        new std::ofstream(Get<const std::string>("debug_graphviz_path")));
+    PADDLE_ENFORCE(fout->good());
+    Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout);
+    return graph;
+  }
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_helper.cc b/paddle/fluid/framework/details/multi_devices_helper.cc
new file mode 100644
index 0000000000..0242274a16
--- /dev/null
+++ b/paddle/fluid/framework/details/multi_devices_helper.cc
@@ -0,0 +1,20 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/multi_devices_helper.h
similarity index 61%
rename from paddle/fluid/framework/details/ssa_graph_builder.h
rename to paddle/fluid/framework/details/multi_devices_helper.h
index e8e8acdb38..175c5a9950 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -39,45 +39,19 @@ namespace details {
 typedef std::vector<
     std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
     GraphVars;
+const char kGraphVars[] = "vars";
 
 // aux variables to represent dependency. Useful to resolve data hazard.
 typedef std::unordered_set<std::unique_ptr<VarHandleBase>> GraphDepVars;
+const char kGraphDepVars[] = "dep_vars";
 
 // all operators. NOTE that even we use a vector here, the operators is
 // unordered.
 typedef std::vector<std::unique_ptr<OpHandleBase>> GraphOps;
+const char kGraphOps[] = "ops";
 
-class SSAGraphBuilder : public ir::Pass {
- public:
-  SSAGraphBuilder() {}
-  virtual ~SSAGraphBuilder() {}
-
-  virtual int GetVarDeviceID(const std::string &var_name) const = 0;
-
-  DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
-
- protected:
-  /**
-   * We only handle write after read(WAR), since it should not have a write
-   * after write in program. If there are write after write operators, we need
-   * prune them.
-   *
-   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
-   */
-  static void PolishGraphToSupportDataHazards(Graph *graph);
-
-  static VarHandle *CreateOrGetLatestVarHandle(Graph *graph, ir::Node *node,
-                                               const platform::Place &place,
-                                               size_t place_offset);
-
-  // Add an output variable (each_var_name, place, place_offset) to op_handle,
-  // which belongs to graph
-  static void CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
-                             ir::Node *new_node, const platform::Place &place,
-                             size_t place_offset);
-
-  static void AddOutputToLeafOps(Graph *graph);
-};
+typedef std::unordered_map<std::string, int> ShardedVarDevice;
+const char kShardedVarDevice[] = "sharded_var_device";
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 7160e346da..6c7e5c1fb0 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -16,12 +16,18 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_bool(
+    cpu_deterministic, false,
+    "Whether to make the result of computation deterministic in CPU side.");
 
 namespace paddle {
 namespace framework {
 namespace details {
 
 void ReduceOpHandle::RunImpl() {
+  platform::RecordEvent r("reduce", nullptr);
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
   auto in_var_handles = DynamicCast<VarHandle>(inputs_);
@@ -89,11 +95,33 @@ void ReduceOpHandle::RunImpl() {
   } else {
     std::vector<const LoDTensor *> lod_tensors =
         GetInputValues<LoDTensor>(in_var_handles, var_scopes);
+
     if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
       this->RunAndRecordEvent([&] {
-        ReduceLoDTensor func(lod_tensors,
-                             out_var->GetMutable<framework::LoDTensor>());
-        VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+        // FIXME(zcd): The order of summing is important,
+        // especially when the type of data is float or double.
+        // For example, the result of `a+b+c+d` may be different
+        // with the result of `c+a+b+d`, so the summing order should be fixed.
+        if (!FLAGS_cpu_deterministic) {
+          ReduceLoDTensor func(lod_tensors,
+                               out_var->GetMutable<framework::LoDTensor>());
+          VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+        } else {
+          // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0
+          // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0.
+          auto &reduce_sum_trg = *this->local_scopes_[0]
+                                      ->FindVar(kLocalExecScopeName)
+                                      ->Get<Scope *>()
+                                      ->FindVar(out_var_handle->name_)
+                                      ->GetMutable<framework::LoDTensor>();
+          ReduceLoDTensor func(lod_tensors, &reduce_sum_trg);
+          VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+
+          auto trg = out_var->GetMutable<framework::LoDTensor>();
+          if (reduce_sum_trg.data<void>() != trg->data<void>()) {
+            TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg);
+          }
+        }
       });
     } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc
index 924ff4d118..f44b374edb 100644
--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
 namespace framework {
@@ -33,7 +34,7 @@ void RPCOpHandle::RunImpl() {
   for (auto *in : inputs_) {
     auto &p = static_cast<VarHandle *>(in)->place_;
     // FIXME(Yancey1989): need a better solution instead of use DebugString()
-    if (in->DebugString() == "dummy") {  // HACK
+    if (ir::IsControlDepVar(*in->Node())) {  // HACK
       continue;
     }
     if (in->GeneratedOp()) {
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 1d80bab90f..5bd974d6b7 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     eptr = std::current_exception();
   }
 
+  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
   drop_scope_counter_ += 1;
   if (!fetch_tensors.empty() ||
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
index cbfbcb1c0c..5e87e0bf50 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -40,6 +40,11 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
       ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
       std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
       std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
+
+  const ir::Graph& Graph() const override {
+    return underlying_executor_->Graph();
+  }
+
   FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
 
  private:
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
deleted file mode 100644
index 7bc130ef6e..0000000000
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/framework/details/ssa_graph_builder.h"
-#include <utility>
-
-namespace paddle {
-namespace framework {
-namespace details {
-void SSAGraphBuilder::PolishGraphToSupportDataHazards(Graph *graph) {
-  for (auto &var_map : graph->Get<GraphVars>("vars")) {
-    for (auto &name_pair : var_map) {
-      if (name_pair.second.size() <= 1) {
-        continue;
-      }
-      auto it_new = name_pair.second.rbegin();
-      auto it_old = name_pair.second.rbegin();
-      ++it_old;
-      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
-        OpHandleBase *write_op = (*it_new)->GeneratedOp();
-        const auto &read_ops = (*it_old)->PendingOps();
-
-        for (auto *read_op : read_ops) {
-          // Manually add a dependency var from read_op to write_op;
-          if (read_op == write_op) {
-            // Read Write is the same op.
-            continue;
-          }
-
-          auto *dep_var = new DummyVarHandle(
-              graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
-          read_op->AddOutput(dep_var);
-          write_op->AddInput(dep_var);
-          graph->Get<GraphDepVars>("dep_vars").emplace(dep_var);
-        }
-      }
-    }
-  }
-}
-
-VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
-    Graph *graph, ir::Node *node, const platform::Place &place,
-    size_t place_offset) {
-  auto &var_holders = graph->Get<GraphVars>("vars")[place_offset];
-  auto &var_holder = var_holders[node->Name()];
-  VarHandle *var = nullptr;
-  if (var_holder.empty()) {
-    if (node->Var()) {
-      var = new VarHandle(graph->CreateVarNode(node->Var()), 0, place_offset,
-                          node->Name(), place);
-    } else {
-      var = new VarHandle(
-          graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0,
-          place_offset, node->Name(), place);
-    }
-    var_holder.emplace_back(var);
-  } else {
-    var = var_holder.rbegin()->get();
-  }
-  return var;
-}
-
-void SSAGraphBuilder::CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
-                                     ir::Node *new_node,
-                                     const platform::Place &place,
-                                     size_t place_offset) {
-  auto &vars = graph->Get<GraphVars>("vars")[place_offset][new_node->Name()];
-  size_t version = vars.size();
-  auto var =
-      new VarHandle(new_node, version, place_offset, new_node->Name(), place);
-  vars.emplace_back(var);
-  op_handle->AddOutput(var);
-}
-
-void SSAGraphBuilder::AddOutputToLeafOps(Graph *graph) {
-  for (auto &op : graph->Get<GraphOps>("ops")) {
-    if (!op->Outputs().empty()) {
-      continue;
-    }
-    auto *dummy_leaf = new DummyVarHandle(
-        graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
-    graph->Get<GraphDepVars>("dep_vars").emplace(dummy_leaf);
-    op->AddOutput(dummy_leaf);
-  }
-}
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
deleted file mode 100644
index b4b49d3de6..0000000000
--- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
-#include <fstream>
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
-#include "paddle/fluid/framework/details/ssa_graph_checker.h"
-#include "paddle/fluid/framework/details/ssa_graph_printer.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
-  std::unique_ptr<SSAGraphBuilder> res(
-#ifdef PADDLE_WITH_CUDA
-      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
-                                  local_scopes_, nccl_ctxs_, strategy_)
-#else
-      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
-                                  local_scopes_, strategy_)
-#endif
-          );  // NOLINT
-
-  if (!strategy_.debug_graphviz_path_.empty()) {
-    std::unique_ptr<std::ostream> fout(
-        new std::ofstream(strategy_.debug_graphviz_path_));
-    PADDLE_ENFORCE(fout->good());
-    std::unique_ptr<GraphvizSSAGraphPrinter> graphviz_printer(
-        new GraphvizSSAGraphPrinter());
-    res.reset(new SSAGraghBuilderWithPrinter(
-        std::move(fout), std::move(graphviz_printer), std::move(res)));
-  }
-  res.reset(new SSAGraghBuilderWithChecker(std::move(res)));
-
-  return res;
-}
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h
deleted file mode 100644
index 91a119de83..0000000000
--- a/paddle/fluid/framework/details/ssa_graph_builder_factory.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/details/build_strategy.h"
-#include "paddle/fluid/framework/details/ssa_graph_builder.h"
-#include "paddle/fluid/platform/place.h"
-
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-class Scope;
-namespace details {
-
-class SSAGraphBuilderFactory {
- public:
-  SSAGraphBuilderFactory(const std::vector<platform::Place>& places,
-                         const std::string& loss_var_name,
-                         const std::unordered_set<std::string>& param_names,
-                         const std::vector<Scope*>& local_scopes,
-                         const BuildStrategy& strategy)
-      : places_(places),
-        loss_var_name_(loss_var_name),
-        param_names_(param_names),
-        local_scopes_(local_scopes),
-        strategy_(strategy) {
-#ifdef PADDLE_WITH_CUDA
-    nccl_ctxs_ = nullptr;
-#endif
-  }
-
-#ifdef PADDLE_WITH_CUDA
-  void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) {
-    nccl_ctxs_ = nccl_ctxs;
-  }
-#endif
-
-  std::unique_ptr<SSAGraphBuilder> Create();
-
- private:
-  std::vector<platform::Place> places_;
-  std::string loss_var_name_;
-  std::unordered_set<std::string> param_names_;
-  std::vector<Scope*> local_scopes_;
-  BuildStrategy strategy_;
-
-#ifdef PADDLE_WITH_CUDA
-  platform::NCCLContextMap* nccl_ctxs_;
-#endif
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
index 8815ec89b2..96fffb7d94 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -32,7 +32,9 @@ class SSAGraphExecutor {
 
   virtual ~SSAGraphExecutor();
 
-  virtual FeedFetchList Run(const std::vector<std::string> &fetch_tensors) = 0;
+  virtual const ir::Graph& Graph() const = 0;
+
+  virtual FeedFetchList Run(const std::vector<std::string>& fetch_tensors) = 0;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
deleted file mode 100644
index cd72162f44..0000000000
--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <iosfwd>
-#include <string>
-#include "paddle/fluid/framework/details/ssa_graph_builder.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-class SSAGraphPrinter {
- public:
-  virtual ~SSAGraphPrinter() {}
-  virtual void Print(const Graph& graph, std::ostream& sout) const = 0;
-};
-
-class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
- public:
-  void Print(const Graph& graph, std::ostream& sout) const override;
-};
-
-class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
- public:
-  SSAGraghBuilderWithPrinter(std::ostream& sout,
-                             std::unique_ptr<SSAGraphPrinter>&& printer,
-                             std::unique_ptr<SSAGraphBuilder>&& builder)
-      : printer_(std::move(printer)),
-        builder_(std::move(builder)),
-        stream_ref_(sout) {}
-
-  SSAGraghBuilderWithPrinter(std::unique_ptr<std::ostream>&& sout,
-                             std::unique_ptr<SSAGraphPrinter>&& printer,
-                             std::unique_ptr<SSAGraphBuilder>&& builder)
-      : printer_(std::move(printer)),
-        builder_(std::move(builder)),
-        stream_ptr_(std::move(sout)),
-        stream_ref_(*stream_ptr_) {}
-
-  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
-    auto new_graph = builder_->Apply(std::move(graph));
-    printer_->Print(*new_graph, stream_ref_);
-    return new_graph;
-  }
-
-  int GetVarDeviceID(const std::string& var_name) const override {
-    return builder_->GetVarDeviceID(var_name);
-  }
-
- private:
-  std::unique_ptr<SSAGraphPrinter> printer_;
-  std::unique_ptr<SSAGraphBuilder> builder_;
-  std::unique_ptr<std::ostream> stream_ptr_;
-  std::ostream& stream_ref_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index f85c62dd6c..994bb6492f 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -14,14 +14,16 @@
 
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 
-#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places, std::unique_ptr<Graph> &&graph)
+    const std::vector<platform::Place> &places,
+    std::unique_ptr<ir::Graph> &&graph)
     : graph_(std::move(graph)),
       pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                        : nullptr),
@@ -33,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
 
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
+  std::unique_ptr<platform::RecordEvent> event(
+      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   BlockingQueue<VarHandleBase *> ready_vars;
@@ -44,18 +48,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   std::unordered_set<OpHandleBase *> delayed_ops;
 
   // Transform SSAGraph to pending_ops & pending_vars
-  for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
+  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
         InsertPendingVar(&pending_vars, &ready_vars, version_pair.get());
       }
     }
   }
-  for (auto &var : graph_->Get<details::GraphDepVars>("dep_vars")) {
+  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
     InsertPendingVar(&pending_vars, &ready_vars, var.get());
   }
 
-  for (auto &op : graph_->Get<details::GraphOps>("ops")) {
+  for (auto &op : graph_->Get<details::GraphOps>(details::kGraphOps)) {
     if (op->Inputs().empty()) {  // Special case, Op has no input.
       ready_ops.insert(op.get());
     } else {
@@ -82,7 +86,8 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 
   // Clean run context
   run_op_futures_.clear();
-  exception_.reset();
+  exception_holder_.Clear();
+  event.reset(nullptr);
 
   // Step 3. Execution
   while (!pending_vars.empty()) {
@@ -102,23 +107,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
 
     if (timeout) {
-      std::unique_lock<std::mutex> l(exception_mu_);
-      if (exception_) {
-        l.unlock();
+      if (exception_holder_.ExceptionCatched()) {
         for (auto &run_op_future : run_op_futures_) {
           run_op_future.wait();
         }
-        l.lock();
-        std::exception *exp = exception_.get();
-        if (dynamic_cast<platform::EOFException *>(exp)) {
-          auto e = *static_cast<platform::EOFException *>(exp);
-          throw e;
-        } else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
-          auto e = *static_cast<platform::EnforceNotMet *>(exp);
-          throw e;
-        } else {
-          LOG(FATAL) << "Unknown exception.";
-        }
+        exception_holder_.Throw();
       } else {
         continue;
       }
@@ -161,7 +154,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
 
   for (auto &fetch_var_name : fetch_tensors) {
-    for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
+    for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
         fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
@@ -228,14 +221,9 @@ void ThreadedSSAGraphExecutor::RunOp(
       ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << "Signal posted";
     } catch (platform::EOFException ex) {
-      std::lock_guard<std::mutex> l(exception_mu_);
-      // EOFException will not cover up existing EnforceNotMet.
-      if (exception_.get() == nullptr) {
-        exception_.reset(new platform::EOFException(ex));
-      }
+      exception_holder_.Catch(ex);
     } catch (platform::EnforceNotMet ex) {
-      std::lock_guard<std::mutex> l(exception_mu_);
-      exception_.reset(new platform::EnforceNotMet(ex));
+      exception_holder_.Catch(ex);
     } catch (...) {
       LOG(FATAL) << "Unknown exception catched";
     }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index bf7c0a367a..9135c1f5d4 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -24,6 +24,7 @@
 #include <functional>
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
@@ -40,8 +41,9 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::unique_ptr<Graph> &&graph);
+                           std::unique_ptr<ir::Graph> &&graph);
 
+  const ir::Graph &Graph() const override { return *graph_; }
   // Run a SSAGraph by a thread pool
   // Use topological sort algorithm
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
@@ -53,13 +55,12 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
              details::OpHandleBase *op);
 
  private:
-  std::unique_ptr<Graph> graph_;
+  std::unique_ptr<ir::Graph> graph_;
   std::unique_ptr<::ThreadPool> pool_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
-  std::mutex exception_mu_;
-  std::unique_ptr<std::exception> exception_;
+  ExceptionHolder exception_holder_;
   std::atomic<int> running_ops_;
 
   void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc
index 6f00abd947..5457870e9f 100644
--- a/paddle/fluid/framework/details/var_handle.cc
+++ b/paddle/fluid/framework/details/var_handle.cc
@@ -26,7 +26,7 @@ std::string VarHandle::DebugString() const {
   return ss.str();
 }
 
-std::string DummyVarHandle::DebugString() const { return "dummy"; }
+std::string DummyVarHandle::DebugString() const { return node_->Name(); }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index c2800c972a..dad170ed78 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -330,12 +330,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   for (auto& op : ctx->ops_) {
-    VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
     op->Run(*local_scope, place_);
-    // NOTE! Please do not delete this line, it's usefull because the debug
-    // string before and after op.run are different, after run the output
-    // will have right shape which is usefull for debug.
-    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
 
     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index ee0604383e..bf7d76a8a6 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -1,5 +1,9 @@
 cc_library(node SRCS node.cc DEPS proto_desc)
 cc_library(graph SRCS graph.cc DEPS node)
-cc_library(pass SRCS pass.cc DEPS graph node)
+cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
+cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
+cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
 
-cc_test(graph_test SRCS graph_test.cc DEPS graph proto_desc op_registry)
+cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
+cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
+cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index e4021aa92b..f87d5212c0 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -12,14 +12,80 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
+#include <unordered_set>
+
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 
 namespace paddle {
 namespace framework {
+namespace ir {
+
+std::vector<std::string> FindDistTrainSendVars(
+    const std::vector<ir::Node *> &nodes) {
+  std::vector<std::string> send_vars;
+  // since parameters are all in block 0,
+  // it's enough to only scan send ops in block 0
+  for (auto &node : nodes) {
+    auto op_vars = node->Op()->InputArgumentNames();
+    send_vars.reserve(send_vars.size() +
+                      std::distance(op_vars.begin(), op_vars.end()));
+    send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end());
+  }
+  return send_vars;
+}
+
+std::vector<std::string> FindDistTrainRecvVars(
+    const std::vector<ir::Node *> &nodes) {
+  std::vector<std::string> recv_vars;
+  for (auto &node : nodes) {
+    auto op_vars = node->Op()->OutputArgumentNames();
+    recv_vars.reserve(recv_vars.size() +
+                      std::distance(op_vars.begin(), op_vars.end()));
+    recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end());
+  }
+  return recv_vars;
+}
+
+bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
+                   const std::vector<std::string> &recv_vars) {
+  if (send_vars.size() == 0 || recv_vars.size() == 0) {
+    return false;
+  }
+
+  /**
+   * Check any of opvars contains `.block` and in sendvars
+   */
+  auto checker = [](const std::vector<std::string> &opvars,
+                    const std::vector<std::string> &rpc_vars) -> bool {
+    for (auto &var : opvars) {
+      // a variable name with the suffix `.block` means it's a splited
+      // variable by (DistributeTranspiler)
+      // [python/paddle/fluid/transpiler/distribute_transpiler.py]
+      if (var.find(".block") != std::string::npos &&
+          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  std::vector<std::string> input_var_names;
+  std::vector<std::string> output_var_names;
+  for (ir::Node *input : node->inputs) {
+    input_var_names.push_back(input->Name());
+  }
+  for (ir::Node *output : node->outputs) {
+    output_var_names.push_back(output->Name());
+  }
+
+  return checker(output_var_names, send_vars) ||
+         checker(input_var_names, recv_vars);
+}
 
-// NOTE(paddle-dev): This graph contains circle.
 Graph::Graph(const ProgramDesc &program) : program_(program) {
   VLOG(3) << "block in program:" << program_.Size();
   std::unordered_map<std::string, VarDesc *> all_vars;
@@ -27,40 +93,160 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
     all_vars.emplace(var->Name(), var);
   }
 
-  std::map<std::string, ir::Node *> var_nodes;
+  std::map<std::string, std::vector<ir::Node *>> var_nodes;
   for (auto *op : program.Block(0).AllOps()) {
     ir::Node *node = CreateOpNode(op);
-
+    // For input args, reuse the same var name if it was created before.
+    // Otherwise, create a new one.
     for (auto &each_var_name : op->InputArgumentNames()) {
       ir::Node *var = nullptr;
       if (var_nodes.find(each_var_name) != var_nodes.end()) {
-        var = var_nodes.at(each_var_name);
+        var = var_nodes.at(each_var_name).back();
       } else if (all_vars.count(each_var_name) != 0) {
         var = CreateVarNode(all_vars.at(each_var_name));
-        var_nodes[each_var_name] = var;
+        var_nodes[each_var_name].push_back(var);
       } else {
-        // TODO(paddle-dev): Seems some assumption doesn't hold?
-        VLOG(3) << op->Type()
-                << " input var not in all_var list: " << each_var_name;
+        // Operation input var can be optional (dispensable). Which means
+        // the operation doesn't really need the var at runtime. In this
+        // case, the no-existed var is ready at the beginning.
         var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable);
-        var_nodes[each_var_name] = var;
+        var_nodes[each_var_name].push_back(var);
       }
       node->inputs.push_back(var);
       var->outputs.push_back(node);
     }
-
+    // For output args, always create a new var.
     for (auto &each_var_name : op->OutputArgumentNames()) {
-      ir::Node *var = nullptr;
-      if (var_nodes.find(each_var_name) != var_nodes.end()) {
-        var = var_nodes.at(each_var_name);
-      } else {
-        var = CreateVarNode(all_vars.at(each_var_name));
-        var_nodes[each_var_name] = var;
-      }
+      ir::Node *var = CreateVarNode(all_vars.at(each_var_name));
+      var_nodes[each_var_name].push_back(var);
       node->outputs.push_back(var);
       var->inputs.push_back(node);
     }
   }
+
+  std::vector<ir::Node *> send_ops;
+  ir::Node *send_bar = nullptr;
+  std::vector<ir::Node *> recv_ops;
+  ir::Node *fetch_bar = nullptr;
+  for (ir::Node *node : Nodes()) {
+    if (node->Name() == "send") {
+      send_ops.push_back(node);
+    } else if (node->Name() == "send_barrier") {
+      PADDLE_ENFORCE(!send_bar, "only has one send barrier");
+      send_bar = node;
+    } else if (node->Name() == "recv") {
+      recv_ops.push_back(node);
+    } else if (node->Name() == "fetch_barrier") {
+      PADDLE_ENFORCE(!fetch_bar, "only has one fetch barrier");
+      fetch_bar = node;
+    }
+  }
+  if (send_bar) {
+    for (ir::Node *send : send_ops) {
+      ir::Node *dep_var = CreateControlDepVar();
+      send->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(send);
+      send_bar->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(send_bar);
+    }
+    for (ir::Node *recv : recv_ops) {
+      ir::Node *dep_var = CreateControlDepVar();
+      recv->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(recv);
+      send_bar->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(send_bar);
+    }
+  }
+  if (fetch_bar) {
+    for (ir::Node *recv : recv_ops) {
+      ir::Node *dep_var = CreateControlDepVar();
+      recv->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(recv);
+      fetch_bar->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(fetch_bar);
+    }
+  }
+
+  std::vector<std::string> send_vars = FindDistTrainSendVars(send_ops);
+  std::vector<std::string> recv_vars = FindDistTrainRecvVars(recv_ops);
+  for (ir::Node *node : Nodes()) {
+    if (IsDistTrainOp(node, send_vars, recv_vars)) {
+      if (fetch_bar && node->Name() == "concat") {
+        ir::Node *dep_var = CreateControlDepVar();
+        fetch_bar->outputs.push_back(dep_var);
+        dep_var->inputs.push_back(fetch_bar);
+        node->inputs.push_back(dep_var);
+        dep_var->outputs.push_back(node);
+      }
+    }
+  }
+
+  /**
+   * We should handle write after read(WAR) and write after write(WAW) here.
+   * Because some of the operators of the program can be executed parallelly.
+   * So, to make the program running in the right order, we should add the
+   * dependence of WAR and WAW.
+   *
+   *
+   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
+   */
+
+  for (auto &var : var_nodes) {
+    auto &versions = var.second;
+    if (versions.size() <= 1) continue;
+
+    auto it_new = versions.rbegin();
+    auto it_old = versions.rbegin();
+    ++it_old;
+    for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
+      ir::Node *write_op =
+          (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
+      const auto &read_ops = (*it_old)->outputs;
+
+      PADDLE_ENFORCE(write_op, "The write_op should not be empty.");
+
+      // Add write after write dependence
+      ir::Node *upstream_op =
+          (*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0];
+      if (upstream_op) {
+        ir::Node *dep_var = CreateControlDepVar();
+        write_op->inputs.push_back(dep_var);
+        upstream_op->outputs.push_back(dep_var);
+        dep_var->outputs.push_back(write_op);
+        dep_var->inputs.push_back(upstream_op);
+      }
+
+      for (auto *read_op : read_ops) {
+        // Manually add a dependency var from read_op to write_op;
+        if (read_op == write_op) {
+          // Read Write is the same op.
+          continue;
+        }
+        // 2 ops might have been connected via other vars.
+        bool has_dep = false;
+        for (ir::Node *r_out : read_op->outputs) {
+          for (ir::Node *w_in : write_op->inputs) {
+            if (r_out == w_in) {
+              has_dep = true;
+              break;
+            }
+          }
+        }
+        if (has_dep) continue;
+
+        ir::Node *dep_var = CreateControlDepVar();
+        read_op->outputs.push_back(dep_var);
+        dep_var->inputs.push_back(read_op);
+        write_op->inputs.push_back(dep_var);
+        dep_var->outputs.push_back(write_op);
+      }
+    }
+  }
+}
+
+bool IsControlDepVar(const ir::Node &var) {
+  return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos;
 }
+}  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index b4ac135b02..c9d55fbf52 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -26,27 +26,35 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+namespace ir {
 
 class Graph {
  public:
-  explicit Graph(const ProgramDesc& program);
+  explicit Graph(const ProgramDesc &program);
 
   virtual ~Graph() {
-    for (auto& attr : attrs_) {
+    for (auto &attr : attrs_) {
       attr_dels_[attr.first]();
     }
     attrs_.clear();
     attr_dels_.clear();
   }
 
+  bool Has(const std::string &attr_name) const {
+    return attrs_.find(attr_name) != attrs_.end();
+  }
+
   template <typename AttrType>
-  AttrType& Get(const std::string& attr_name) const {
-    return *boost::any_cast<AttrType*>(attrs_.at(attr_name));
+  AttrType &Get(const std::string &attr_name) const {
+    PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
+                   attr_name);
+    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
   }
 
   template <typename AttrType>
-  void Set(const std::string& attr_name, AttrType* attr) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+  void Set(const std::string &attr_name, AttrType *attr) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph",
+                   attr_name);
     attrs_[attr_name] = attr;
     attr_dels_[attr_name] = [attr, attr_name]() {
       VLOG(3) << "deleting " << attr_name;
@@ -54,29 +62,70 @@ class Graph {
     };
   }
 
-  ir::Node* CreateVarNode(VarDesc* var_desc) {
-    nodes.emplace_back(new ir::Node(var_desc));
-    return nodes.back().get();
+  const std::unordered_set<ir::Node *> &Nodes() const { return node_set_; }
+
+  // Create a normal variable with non-null VarDesc.
+  ir::Node *CreateVarNode(VarDesc *var_desc) {
+    return AddNode(new ir::Node(var_desc));
   }
 
-  ir::Node* CreateOpNode(OpDesc* op_desc) {
-    nodes.emplace_back(new ir::Node(op_desc));
-    return nodes.back().get();
+  // Create a normal runnable operator with OpDesc.
+  ir::Node *CreateOpNode(OpDesc *op_desc) {
+    return AddNode(new ir::Node(op_desc));
   }
 
-  ir::Node* CreateEmptyNode(const std::string& name, ir::Node::Type type) {
-    nodes.emplace_back(new ir::Node(name, type));
-    return nodes.back().get();
+  // Create a control dependency var that connects 2 operations. The
+  // var doesn't hold any data. Other than that, it's no different from
+  // other var, considering dependency analysis.
+  ir::Node *CreateControlDepVar() {
+    // TODO(panyx0718): control var name should be really unique.
+    const std::string name = string::Sprintf(
+        "%s@%llu", ir::Node::kControlDepVarName, node_set_.size());
+    return AddNode(new ir::Node(name, ir::Node::Type::kVariable));
   }
 
-  std::vector<std::unique_ptr<ir::Node>> nodes;
+  // A more free style way of creating a graph node. Mostly use for test
+  // or "copy" from another node. Avoid using it if possible.
+  ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) {
+    return AddNode(new ir::Node(name, type));
+  }
+
+  // Clear all node information of the graph and return the ownership of the
+  // nodes.
+  std::vector<std::unique_ptr<ir::Node>> ReleaseNodes() {
+    std::vector<std::unique_ptr<ir::Node>> ret;
+    for (auto &n : nodes_) {
+      ret.emplace_back(n.second.release());
+    }
+    nodes_.clear();
+    node_set_.clear();
+    return ret;
+  }
 
  private:
+  // This method takes ownership of `node`.
+  ir::Node *AddNode(ir::Node *node) {
+    PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
+    nodes_[node].reset(node);
+    node_set_.insert(node);
+    return node;
+  }
+
+  void RemoveNode(ir::Node *node) {
+    PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
+    node_set_.erase(node);
+    nodes_.erase(node);
+  }
+
   // NOTE: program_ shouldn't be exposed to user.
-  const ProgramDesc& program_;
+  const ProgramDesc &program_;
   std::map<std::string, boost::any> attrs_;
   std::map<std::string, std::function<void(void)>> attr_dels_;
+  std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
+  std::unordered_set<ir::Node *> node_set_;
 };
 
+bool IsControlDepVar(const ir::Node &var);
+}  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
new file mode 100644
index 0000000000..b1c19e6535
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -0,0 +1,118 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <unordered_set>
+
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace {
+void SortHelper(
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    ir::Node *node, std::unordered_set<ir::Node *> *visited,
+    std::vector<ir::Node *> *ret) {
+  visited->insert(node);
+
+  for (auto adj : adj_list.at(node)) {
+    if (visited->find(adj) == visited->end()) {
+      SortHelper(adj_list, adj, visited, ret);
+    }
+  }
+
+  VLOG(3) << "topology sort insert: " << node->Name()
+          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
+  ret->push_back(node);
+}
+
+bool HasCircleHelper(
+    ir::Node *node,
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    std::unordered_set<ir::Node *> *visited,
+    std::unordered_set<ir::Node *> *in_trace) {
+  if (visited->find(node) == visited->end()) {
+    visited->insert(node);
+    in_trace->insert(node);
+
+    for (ir::Node *in : adj_list.at(node)) {
+      if (visited->find(in) == visited->end() &&
+          HasCircleHelper(in, adj_list, visited, in_trace)) {
+        return true;
+      } else if (in_trace->find(in) != in_trace->end()) {
+        return true;
+      }
+    }
+  }
+  in_trace->erase(node);
+  return false;
+}
+
+bool HasCircleInternal(
+    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list) {
+  std::unordered_set<ir::Node *> visited;
+  std::unordered_set<ir::Node *> in_trace;
+  for (auto &adj : adj_list) {
+    if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace)) {
+      return true;
+    }
+  }
+  return false;
+}
+}  // namespace
+
+bool HasCircle(const Graph &graph) {
+  return HasCircleInternal(BuildOperationAdjList(graph));
+}
+
+std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
+      BuildOperationAdjList(graph);
+  PADDLE_ENFORCE(!HasCircleInternal(adj_list));
+  std::unordered_set<ir::Node *> visited;
+  std::vector<ir::Node *> ret;
+  for (auto adj : adj_list) {
+    if (visited.find(adj.first) == visited.end()) {
+      SortHelper(adj_list, adj.first, &visited, &ret);
+    }
+  }
+  return ret;
+}
+
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
+    const Graph &graph) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
+
+  for (auto &n : graph.Nodes()) {
+    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    if (adj_list.find(n) == adj_list.end()) {
+      adj_list[n] = std::unordered_set<ir::Node *>();
+    }
+    for (auto &var : n->inputs) {
+      for (auto &adj_n : var->inputs) {
+        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
+        adj_list[n].insert(adj_n);
+        VLOG(3) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
+                << " -> " << n->Name() << reinterpret_cast<void *>(n)
+                << "  via " << var->Name() << reinterpret_cast<void *>(var);
+      }
+    }
+  }
+  return adj_list;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
new file mode 100644
index 0000000000..cd6c53a07f
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/node.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+// Test if the graph contains circle.
+bool HasCircle(const Graph &graph);
+
+// Topology Sort the operations in the graph from inputs to outputs.
+// `graph` cannot contain circle.
+std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
+
+// Build an adjacency list of operations for the `graph`.
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
+    const Graph &graph);
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc
new file mode 100644
index 0000000000..a260dd3da2
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -0,0 +1,125 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void BuildCircleGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+
+  o1->outputs.push_back(v1);
+  o1->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o1);
+}
+
+void BuildCircleGraph2(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+
+  o2->outputs.push_back(v2);
+  o1->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o1);
+}
+
+void BuildNoCircleGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
+  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
+  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
+  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
+
+  // o1->v1->o2
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+  // o2->v2->o3
+  // o2->v2->o4
+  o2->outputs.push_back(v2);
+  o3->inputs.push_back(v2);
+  o4->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o3);
+  v2->outputs.push_back(o4);
+  // o2->v3->o5
+  o2->outputs.push_back(v3);
+  o5->inputs.push_back(v3);
+  v3->inputs.push_back(o2);
+  v3->outputs.push_back(o5);
+  // o3-v4->o5
+  o3->outputs.push_back(v4);
+  o5->inputs.push_back(v4);
+  v4->inputs.push_back(o3);
+  v4->outputs.push_back(o5);
+}
+
+TEST(GraphHelperTest, Basic) {
+  ProgramDesc prog;
+
+  Graph g(prog);
+  BuildCircleGraph(&g);
+  ASSERT_TRUE(HasCircle(g));
+
+  Graph g2(prog);
+  BuildCircleGraph2(&g2);
+  ASSERT_TRUE(HasCircle(g2));
+
+  auto adj_list = BuildOperationAdjList(g2);
+  for (auto& adj : adj_list) {
+    auto& adj_set = adj.second;
+    if (adj.first->Name() == "op1") {
+      ASSERT_EQ((*adj_set.begin())->Name(), "op2");
+    } else if (adj.first->Name() == "op2") {
+      ASSERT_EQ((*adj_set.begin())->Name(), "op1");
+    } else {
+      ASSERT_TRUE(false);
+    }
+  }
+
+  Graph g3(prog);
+  BuildNoCircleGraph(&g3);
+  ASSERT_FALSE(HasCircle(g3));
+  auto sorted = TopologySortOperations(g3);
+  std::map<std::string, size_t> node_map;
+  for (size_t i = 0; i < sorted.size(); ++i) {
+    node_map[sorted[i]->Name()] = i;
+  }
+  ASSERT_EQ(node_map.at("op1"), 0UL);
+  ASSERT_EQ(node_map.at("op2"), 1UL);
+  ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index 4e23bf124f..f9e6bdf362 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -76,6 +76,7 @@ TEST(GraphTest, Basic) {
   op->SetType("sum");
   op->SetInput("X", {"test_a", "test_b", "test_c"});
   op->SetOutput("Out", {"test_out"});
+  op->SetAttr("op_role", 1);
 
   prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS);
   prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS);
@@ -92,21 +93,22 @@ TEST(GraphTest, Basic) {
   ASSERT_EQ(proto::VarType::LOD_TENSOR,
             prog.MutableBlock(0)->Var("test_out")->GetType());
 
-  std::unique_ptr<Graph> g(new Graph(prog));
-  ASSERT_EQ(g->nodes[0]->Name(), "sum");
-  ASSERT_EQ(g->nodes[0]->inputs[0]->Name(), "test_a");
-  ASSERT_EQ(g->nodes[0]->inputs[1]->Name(), "test_b");
-  ASSERT_EQ(g->nodes[0]->inputs[2]->Name(), "test_c");
-  ASSERT_EQ(g->nodes[0]->outputs[0]->Name(), "test_out");
-  ASSERT_EQ(g->nodes[1]->Name(), "test_a");
-  ASSERT_EQ(g->nodes[1]->outputs[0]->Name(), "sum");
-  ASSERT_EQ(g->nodes[2]->Name(), "test_b");
-  ASSERT_EQ(g->nodes[2]->outputs[0]->Name(), "sum");
-  ASSERT_EQ(g->nodes[3]->Name(), "test_c");
-  ASSERT_EQ(g->nodes[3]->outputs[0]->Name(), "sum");
-  ASSERT_EQ(g->nodes[4]->Name(), "test_out");
-  ASSERT_EQ(g->nodes[4]->inputs[0]->Name(), "sum");
-  ASSERT_EQ(g->nodes.size(), 5);
+  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+  std::vector<ir::Node *> nodes(g->Nodes().begin(), g->Nodes().end());
+  for (ir::Node *n : nodes) {
+    if (n->Name() == "sum") {
+      ASSERT_EQ(n->inputs.size(), 3UL);
+      ASSERT_EQ(n->outputs.size(), 1UL);
+    } else if (n->Name() == "test_a" || n->Name() == "test_b" ||
+               n->Name() == "test_c") {
+      ASSERT_EQ(n->inputs.size(), 0UL);
+      ASSERT_EQ(n->outputs.size(), 1UL);
+    } else if (n->Name() == "test_out") {
+      ASSERT_EQ(n->inputs.size(), 1UL);
+      ASSERT_EQ(n->outputs.size(), 0UL);
+    }
+  }
+  ASSERT_EQ(nodes.size(), 5);
 }
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
new file mode 100644
index 0000000000..8cb812d138
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <unordered_set>
+
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+static const char kGraphVizPath[] = "graph_viz_path";
+
+std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string graph_viz_path = Get<std::string>(kGraphVizPath);
+  std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
+  PADDLE_ENFORCE(fout->good());
+  std::ostream& sout = *fout;
+
+  size_t var_id = 0;
+  std::unordered_map<const ir::Node*, size_t> vars;
+
+  sout << "digraph G {\n";
+
+  for (const ir::Node* n : graph->Nodes()) {
+    if (n->NodeType() != ir::Node::Type::kVariable) continue;
+    size_t cur_var_id = var_id++;
+    vars[n] = cur_var_id;
+
+    sout << "var_" << cur_var_id << " [label=\"" << n->Name() << "\"]"
+         << std::endl;
+  }
+
+  size_t op_id = 0;
+  for (const ir::Node* n : graph->Nodes()) {
+    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    std::string op_name = "op_" + std::to_string(op_id++);
+    sout << op_name << " [label=\"" << n->Name() << "\", shape=rect]"
+         << std::endl;
+    for (auto in : n->inputs) {
+      std::string var_name = "var_" + std::to_string(vars[in]);
+      sout << var_name << " -> " << op_name << std::endl;
+    }
+
+    for (auto out : n->outputs) {
+      std::string var_name = "var_" + std::to_string(vars[out]);
+      sout << op_name << " -> " << var_name << std::endl;
+    }
+  }
+
+  sout << "}\n";
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass)
+    .RequirePassAttr(paddle::framework::ir::kGraphVizPath);
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h
new file mode 100644
index 0000000000..1fd8c8a26e
--- /dev/null
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class GraphVizPass : public Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
index 86376e7e8b..aca77da8d6 100644
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -15,5 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/node.h"
 
 namespace paddle {
-namespace framework {}  // namespace framework
+namespace framework {
+namespace ir {
+const char Node::kControlDepVarName[] = "__control_var";
+}  // namespace ir
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index b98c29b81d..b3138fccee 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -27,6 +27,8 @@ namespace ir {
 class Node {
  public:
   enum class Type { kOperation, kVariable };
+  static const char kControlDepVarName[];
+
   explicit Node(const std::string& name, Type type)
       : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}
 
@@ -50,6 +52,7 @@ class Node {
     PADDLE_ENFORCE(type_ == Type::kVariable);
     return var_desc_;
   }
+
   OpDesc* Op() {
     PADDLE_ENFORCE(type_ == Type::kOperation);
     return op_desc_;
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
index c05d7d0bb5..d7158eba62 100644
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -13,7 +13,34 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
-namespace framework {}  // namespace framework
+namespace framework {
+namespace ir {
+std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
+  PADDLE_ENFORCE(!applied_, "Pass can only Apply() once.");
+  PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
+  for (const std::string& attr : required_pass_attrs_) {
+    PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
+                   "Required pass atrribute %s not set.", attr);
+  }
+  for (const std::string& attr : required_graph_attrs_) {
+    PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.",
+                   attr);
+  }
+  auto applied_graph = ApplyImpl(std::move(graph));
+  // TODO(panyx0718): Add more verifications.
+  PADDLE_ENFORCE(!HasCircle(*applied_graph),
+                 "Illegal Pass. Generated graph shouldn't has cycle.");
+  applied_ = true;
+  return applied_graph;
+}
+
+PassRegistry& PassRegistry::Instance() {
+  static PassRegistry g_pass_info_map;
+  return g_pass_info_map;
+}
+}  // namespace ir
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index f52ba788d5..0f14083d25 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -14,21 +14,187 @@ limitations under the License. */
 
 #pragma once
 
+#include <functional>
+#include <map>
+#include <string>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
+template <typename PassType>
+struct PassRegistrar;
 
 class Pass {
  public:
   Pass() = default;
-  virtual ~Pass() {}
+  virtual ~Pass() {
+    for (auto &attr : attrs_) {
+      if (attr_dels_.find(attr.first) != attr_dels_.end()) {
+        attr_dels_[attr.first]();
+      }
+    }
+    attrs_.clear();
+    attr_dels_.clear();
+  }
+
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;
+
+  // Get a reference to the attributed previously set.
+  template <typename AttrType>
+  AttrType &Get(const std::string &attr_name) const {
+    PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
+                   "%s attr not registered for pass.", attr_name);
+    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+  }
+
+  // Set a pointer to the attribute. Pass takes ownership of the attribute.
+  template <typename AttrType>
+  void Set(const std::string &attr_name, AttrType *attr) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
+                   attr_name);
+    attrs_[attr_name] = attr;
+    attr_dels_[attr_name] = [attr, attr_name]() {
+      VLOG(3) << "deleting " << attr_name;
+      delete attr;
+    };
+  }
+
+  // Set a pointer to the attribute. Pass doesn't take ownership. Caller
+  // should delete the attribute.
+  template <typename AttrType>
+  void SetNotOwned(const std::string &attr_name, AttrType *attr) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+    attrs_[attr_name] = attr;
+  }
+
+ protected:
+  virtual std::unique_ptr<Graph> ApplyImpl(
+      std::unique_ptr<Graph> graph) const = 0;
+
+ private:
+  template <typename PassType>
+  friend struct PassRegistrar;
+
+  void RegisterRequiredPassAttrs(const std::unordered_set<std::string> &attrs) {
+    required_pass_attrs_.insert(attrs.begin(), attrs.end());
+  }
+
+  void RegisterRequiredGraphAttrs(
+      const std::unordered_set<std::string> &attrs) {
+    required_graph_attrs_.insert(attrs.begin(), attrs.end());
+  }
+
+  mutable bool applied_{false};
+  std::unordered_set<std::string> required_pass_attrs_;
+  std::unordered_set<std::string> required_graph_attrs_;
+  std::map<std::string, boost::any> attrs_;
+  std::map<std::string, std::function<void(void)>> attr_dels_;
+};
+
+using PassCreator = std::function<std::unique_ptr<Pass>()>;
+
+class Registrar {
+ public:
+  // In our design, various kinds of passes,
+  // have their corresponding registry and registrar. The action of
+  // registration is in the constructor of a global registrar variable, which
+  // are not used in the code that calls package framework, and would
+  // be removed from the generated binary file by the linker. To avoid such
+  // removal, we add Touch to all registrar classes and make USE_PASS macros to
+  // call this method. So, as long as the callee code calls USE_PASS, the global
+  // registrar variable won't be removed by the linker.
+  void Touch() {}
+};
 
-  virtual std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const = 0;
+class PassRegistry {
+ public:
+  static PassRegistry &Instance();
+
+  bool Has(const std::string &pass_type) const {
+    return map_.find(pass_type) != map_.end();
+  }
+
+  void Insert(const std::string &pass_type, const PassCreator &pass_creator) {
+    PADDLE_ENFORCE(!Has(pass_type), "Pass %s has been registered", pass_type);
+    map_.insert({pass_type, pass_creator});
+  }
+
+  std::unique_ptr<Pass> Get(const std::string &pass_type) const {
+    PADDLE_ENFORCE(Has(pass_type), "Pass %s has not been registered",
+                   pass_type);
+    return map_.at(pass_type)();
+  }
+
+ private:
+  PassRegistry() = default;
+  std::unordered_map<std::string, PassCreator> map_;
+
+  DISABLE_COPY_AND_ASSIGN(PassRegistry);
 };
+
+template <typename PassType>
+struct PassRegistrar : public Registrar {
+  explicit PassRegistrar(const char *pass_type) {
+    PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
+                   "'%s' is registered more than once.", pass_type);
+    PassRegistry::Instance().Insert(
+        pass_type, [this]() -> std::unique_ptr<Pass> {
+          std::unique_ptr<Pass> pass(new PassType());
+          pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
+          pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
+          return pass;
+        });
+  }
+
+  PassRegistrar<PassType> &RequirePassAttr(const std::string &attr) {
+    required_pass_attrs_.insert(attr);
+    return *this;
+  }
+
+  PassRegistrar<PassType> &RequireGraphAttr(const std::string &attr) {
+    required_graph_attrs_.insert(attr);
+    return *this;
+  }
+
+ private:
+  std::unordered_set<std::string> required_pass_attrs_;
+  std::unordered_set<std::string> required_graph_attrs_;
+};
+
+#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg)                   \
+  struct __test_global_namespace_##uniq_name##__ {};                          \
+  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
+                             __test_global_namespace_##uniq_name##__>::value, \
+                msg)
+
+// Register a new pass that can be applied on the IR.
+#define REGISTER_PASS(pass_type, pass_class)                          \
+  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                                \
+      __reg_pass__##pass_type,                                        \
+      "REGISTER_PASS must be called in global namespace");            \
+  static ::paddle::framework::ir::PassRegistrar<pass_class>           \
+      __pass_registrar_##pass_type##__(#pass_type);                   \
+  int TouchPassRegistrar_##pass_type() {                              \
+    __pass_registrar_##pass_type##__.Touch();                         \
+    return 0;                                                         \
+  }                                                                   \
+  static ::paddle::framework::ir::PassRegistrar<pass_class>           \
+      &__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \
+          __pass_registrar_##pass_type##__
+
+#define USE_PASS(pass_type)                                           \
+  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                                \
+      __use_pass_itself_##pass_type,                                  \
+      "USE_PASS must be called in global namespace");                 \
+  extern int TouchPassRegistrar_##pass_type();                        \
+  static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \
+      TouchPassRegistrar_##pass_type()
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc
new file mode 100644
index 0000000000..5b5011412e
--- /dev/null
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/pass.h"
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+void BuildCircleGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+
+  o2->outputs.push_back(v2);
+  o1->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o1);
+}
+
+class TestPass : public Pass {
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
+    graph->Set<int>("copy_test_pass_attr", new int);
+    graph->Set<int>("copy_test_graph_attr", new int);
+
+    int test_pass_attr = this->Get<int>("test_pass_attr");
+    graph->Get<int>("copy_test_pass_attr") = test_pass_attr + 1;
+
+    int test_graph_attr = graph->Get<int>("test_graph_attr");
+    graph->Get<int>("copy_test_graph_attr") = test_graph_attr + 1;
+    return graph;
+  }
+};
+
+TEST(PassTest, TestPassAttrCheck) {
+  ProgramDesc prog;
+  auto pass = PassRegistry::Instance().Get("test_pass");
+  std::unique_ptr<Graph> graph(new Graph(prog));
+  std::string exception;
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("test_pass_attr not set") != exception.npos);
+
+  int val = 1;
+  graph.reset(new Graph(prog));
+  pass->SetNotOwned<int>("test_pass_attr", &val);
+
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("test_graph_attr not set") != exception.npos);
+
+  graph.reset(new Graph(prog));
+  graph->Set<int>("test_graph_attr", new int);
+  graph->Get<int>("test_graph_attr") = 1;
+  graph = pass->Apply(std::move(graph));
+  ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
+  ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
+
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
+
+  pass = PassRegistry::Instance().Get("test_pass");
+  pass->SetNotOwned<int>("test_pass_attr", &val);
+  graph.reset(new Graph(prog));
+  BuildCircleGraph(graph.get());
+  graph->Set<int>("test_graph_attr", new int);
+  graph->Get<int>("test_graph_attr") = 2;
+  try {
+    auto tmp = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("shouldn't has cycle") != exception.npos);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(test_pass, paddle::framework::ir::TestPass)
+    .RequirePassAttr("test_pass_attr")
+    .RequireGraphAttr("test_graph_attr");
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index 71bebeea63..7836ecb127 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -16,6 +16,7 @@
 
 #include <algorithm>
 #include <initializer_list>
+#include <memory>
 #include <vector>
 
 #include "paddle/fluid/framework/tensor.h"
@@ -386,13 +387,14 @@ template <typename T>
 class CPUVector : public std::vector<T, std::allocator<T>> {
  public:
   CPUVector() : std::vector<T>() {}
-  CPUVector(size_t count, const T &value = T())
+  CPUVector(size_t count, const T &value = T())  // NOLINT
       : std::vector<T>(count, value) {}
   CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}
-  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}
-  explicit CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}
+  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}  // NOLINT
+  CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}
   CPUVector(CPUVector<T> &&other) : std::vector<T>(std::move(other)) {}
-  CPUVector(std::vector<T> &&other) : std::vector<T>(std::move(other)) {}
+  CPUVector(std::vector<T> &&other)  // NOLINT
+      : std::vector<T>(std::move(other)) {}
   CPUVector &operator=(const CPUVector &other) {
     this->assign(other.begin(), other.end());
     return *this;
@@ -410,8 +412,6 @@ class CPUVector : public std::vector<T, std::allocator<T>> {
     return os;
   }
 
-  void resize(size_t size) { this->resize(size); }
-
   T &operator[](size_t id) { return this->at(id); }
 
   const T &operator[](size_t id) const { return this->at(id); }
diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc
new file mode 100644
index 0000000000..0599c8d384
--- /dev/null
+++ b/paddle/fluid/framework/mixed_vector_test.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <memory>
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/mixed_vector.h"
+
+template <typename T>
+using vec = paddle::framework::Vector<T>;
+
+TEST(mixed_vector, CPU_VECTOR) {
+  vec<int> tmp;
+  for (int i = 0; i < 10; ++i) {
+    tmp.push_back(i);
+  }
+  ASSERT_EQ(tmp.size(), 10UL);
+  vec<int> tmp2;
+  tmp2 = tmp;
+  ASSERT_EQ(tmp2.size(), 10UL);
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(tmp2[i], i);
+    ASSERT_EQ(tmp2[i], tmp[i]);
+  }
+  int cnt = 0;
+  for (auto& t : tmp2) {
+    ASSERT_EQ(t, cnt);
+    ++cnt;
+  }
+}
+
+TEST(mixed_vector, InitWithCount) {
+  paddle::framework::Vector<int> vec(10, 10);
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(vec[i], 10);
+  }
+}
+
+TEST(mixed_vector, ForEach) {
+  vec<int> tmp;
+  for (auto& v : tmp) {
+    VLOG(3) << v;
+  }
+}
+
+TEST(mixed_vector, Reserve) {
+  paddle::framework::Vector<int> vec;
+  vec.reserve(1);
+  vec.push_back(0);
+  vec.push_back(0);
+  vec.push_back(0);
+}
+
+TEST(mixed_vector, Resize) {
+  paddle::framework::Vector<int> vec;
+  vec.resize(1);
+  vec.push_back(0);
+  vec.push_back(0);
+  vec.push_back(0);
+}
diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu
index d57f825108..4b0caa8d35 100644
--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@@ -11,7 +11,9 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. */
+
 #include <cuda_runtime.h>
+#include <memory>
 
 #include "glog/logging.h"
 #include "gtest/gtest.h"
@@ -21,26 +23,6 @@
 template <typename T>
 using vec = paddle::framework::Vector<T>;
 
-TEST(mixed_vector, CPU_VECTOR) {
-  vec<int> tmp;
-  for (int i = 0; i < 10; ++i) {
-    tmp.push_back(i);
-  }
-  ASSERT_EQ(tmp.size(), 10UL);
-  vec<int> tmp2;
-  tmp2 = tmp;
-  ASSERT_EQ(tmp2.size(), 10UL);
-  for (int i = 0; i < 10; ++i) {
-    ASSERT_EQ(tmp2[i], i);
-    ASSERT_EQ(tmp2[i], tmp[i]);
-  }
-  int cnt = 0;
-  for (auto& t : tmp2) {
-    ASSERT_EQ(t, cnt);
-    ++cnt;
-  }
-}
-
 static __global__ void multiply_10(int* ptr) {
   for (int i = 0; i < 10; ++i) {
     ptr[i] *= 10;
@@ -91,24 +73,3 @@ TEST(mixed_vector, MultiGPU) {
     ASSERT_EQ(tmp[i], i * 100);
   }
 }
-
-TEST(mixed_vector, InitWithCount) {
-  paddle::framework::Vector<int> vec(10, 10);
-  for (int i = 0; i < 10; ++i) {
-    ASSERT_EQ(vec[i], 10);
-  }
-}
-
-TEST(mixed_vector, ForEach) {
-  vec<int> tmp;
-  for (auto& v : tmp) {
-  }
-}
-
-TEST(mixed_vector, Reserve) {
-  paddle::framework::Vector<int> vec;
-  vec.reserve(1);
-  vec.push_back(0);
-  vec.push_back(0);
-  vec.push_back(0);
-}
diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc
index db95861c51..3e17a512ce 100644
--- a/paddle/fluid/framework/op_kernel_type_test.cc
+++ b/paddle/fluid/framework/op_kernel_type_test.cc
@@ -29,6 +29,13 @@ TEST(OpKernelType, ToString) {
   ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
             "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
             "CUDNN]");
+
+  using CUDAPlace = paddle::platform::CUDAPlace;
+  OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW,
+                               LibraryType::kCUDNN);
+  ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
+            "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_"
+            "type[CUDNN]");
 }
 
 TEST(OpKernelType, Hash) {
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index 001b5cb5a8..2288c7fe66 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -40,6 +40,40 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
   return OpProtoAndCheckerMaker::VariableBuilder{output};
 }
 
+void OpProtoAndCheckerMaker::Reuse(const std::string& name,
+                                   const std::string& reused_name) {
+  bool found = false;
+  proto::OpProto::Var* var;
+
+  for (auto& var : proto_->inputs()) {
+    if (var.name() == reused_name) {
+      found = true;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found == true,
+                 "Input/Output name: %s reused_name: %s, one of them is not "
+                 "exists or not matched.",
+                 name, reused_name);
+
+  found = false;
+  for (int i = 0; i < proto_->outputs().size(); ++i) {
+    var = proto_->mutable_outputs()->Mutable(i);
+    if (var->name() == name) {
+      PADDLE_ENFORCE(!var->has_reuse(),
+                     "Output(%s) has been set reused var of %s", name,
+                     var->reuse());
+      found = true;
+      var->set_reuse(reused_name);
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found == true,
+                 "Input/Output name: %s reused_name: %s, one of them is not "
+                 "exists or not matched.",
+                 name, reused_name);
+}
+
 void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   std::unordered_set<std::string> names;
   auto checker = [&](const std::string& name) {
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 92f86bb5de..80970291c9 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -78,6 +78,8 @@ class OpProtoAndCheckerMaker {
   VariableBuilder AddOutput(const std::string &name,
                             const std::string &comment);
 
+  void Reuse(const std::string &name, const std::string &reused_name);
+
   template <typename T>
   TypedAttrChecker<T> &AddAttr(const std::string &name,
                                const std::string &comment,
diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
index 58f70cb39c..b71c7b6468 100644
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -49,6 +49,15 @@ TEST(ProtoMaker, DuplicatedInOut) {
 }
 
 class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "input of test op");
+    AddOutput("XOut", "output of test op").Reuse("X");
+  }
+};
+
+class TestInplaceProtoMaker2
+    : public paddle::framework::OpProtoAndCheckerMaker {
  public:
   void Make() {
     AddInput("X", "input of test op");
@@ -58,12 +67,100 @@ class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };
 
 TEST(ProtoMaker, InplaceOutput) {
-  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::proto::OpProto op_proto, op_proto2;
   paddle::framework::OpAttrChecker op_checker;
   TestInplaceProtoMaker proto_maker;
-  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+  TestInplaceProtoMaker2 proto_maker2;
+
+  proto_maker(&op_proto, &op_checker);
+
+  ASSERT_THROW(proto_maker2(&op_proto2, &op_checker),
                paddle::platform::EnforceNotMet);
-  // proto_maker(&op_proto, &op_checker);
-  // proto_maker.Make();
-  // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
 }
+
+// normal reuse
+class TestReuseProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "input of test op");
+    AddInput("Y", "input of test op");
+    AddOutput("Out", "output of test op");
+    AddOutput("XOut", "output of test op");
+    // avoid destructor exception.
+    // Validate();
+    TestReuse();
+  }
+
+  virtual void TestReuse() {}
+};
+
+// test duplicate reuse error
+class TestReuseProtoMaker2 : public TestReuseProtoMaker {
+ public:
+  void TestReuse() {
+    Reuse("Out", "X");
+    Reuse("Out", "Y");
+  }
+};
+
+// NotExists Input
+class TestReuseProtoMaker3 : public TestReuseProtoMaker {
+ public:
+  void TestReuse() {
+    Reuse("Out", "NotExists");
+    Reuse("XOut", "X");
+  }
+};
+
+// NotExists Output
+class TestReuseProtoMaker4 : public TestReuseProtoMaker {
+ public:
+  void TestReuse() { Reuse("NotExists", "X"); }
+};
+
+TEST(ProtoMaker, Reuse) {
+  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::OpAttrChecker op_checker;
+  TestReuseProtoMaker proto_maker;
+  proto_maker(&op_proto, &op_checker);
+}
+
+// NOTE(dzhwinter):
+// There is a Fatal CHECK on base class destructor, which will call abort inside
+// instead of
+// throw an exception. If we throw an exception in Make(), we will trigger the
+// CHECK and terminate the tests.
+//
+// I had tried to replace the default CHECK with a exception, however, it's
+// still not supported by glog.
+// the details:
+// https://github.com/google/glog/issues/249
+// https://github.com/facebookresearch/TensorComprehensions/issues/351
+/*
+TEST(ProtoMaker, ReuseWithException) {
+  paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4;
+  paddle::framework::OpAttrChecker op_checker;
+  TestReuseProtoMaker2 proto_maker2;
+  TestReuseProtoMaker3 proto_maker3;
+  TestReuseProtoMaker4 proto_maker4;
+  EXPECT_THROW(proto_maker2(&op_proto2, &op_checker),
+               paddle::platform::EnforceNotMet);
+
+  EXPECT_THROW(proto_maker3(&op_proto3, &op_checker),
+               paddle::platform::EnforceNotMet);
+
+  EXPECT_THROW(proto_maker4(&op_proto4, &op_checker),
+               paddle::platform::EnforceNotMet);
+}
+
+void FailureFunction() {
+  throw std::runtime_error("Check failed in destructor.");
+  // return 0;
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  google::InstallFailureFunction(&FailureFunction);
+  return RUN_ALL_TESTS();
+}
+*/
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index d1dc5fcd97..d04f774496 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/var_type.h"
@@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name,
   }
 
   if (var->IsType<LoDTensor>()) {
-    return var->Get<LoDTensor>().dims();
+    const LoDTensor& tensor = var->Get<LoDTensor>();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return DDim({-1});
+    }
+    return tensor.dims();
   } else if (var->IsType<SelectedRows>()) {
     if (get_actual_dim) {
       return var->Get<SelectedRows>().value().dims();
@@ -69,6 +74,26 @@ static DDim GetDims(const Scope& scope, const std::string& name,
   }
 }
 
+static std::string GetDtype(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  if (var == nullptr) {
+    return "";
+  }
+
+  if (var->IsType<LoDTensor>()) {
+    const LoDTensor& tensor = var->Get<LoDTensor>();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return "";
+    }
+    return DataTypeToString(ToDataType(tensor.type()));
+  } else if (var->IsType<SelectedRows>()) {
+    return DataTypeToString(
+        ToDataType(var->Get<SelectedRows>().value().type()));
+  } else {
+    return "";
+  }
+}
+
 static int GetRowSize(const Scope& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
@@ -91,14 +116,18 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
   }
 
   if (var->IsType<LoDTensor>()) {
-    return var->Get<LoDTensor>().lod();
+    const LoDTensor& tensor = var->Get<LoDTensor>();
+    if (UNLIKELY(!tensor.IsInitialized())) {
+      return default_lod;
+    }
+    return tensor.lod();
   } else {
     return default_lod;
   }
 }
 
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
-  VLOG(10) << "- " << DebugStringEx(&scope);
+  VLOG(4) << place << " " << DebugStringEx(&scope);
   if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
     PADDLE_THROW("Cannot run operator on place %s", place);
@@ -107,8 +136,10 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     platform::SetDeviceId(dev_id);
 #endif
   }
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  platform::RecordEvent record_event(Type(), pool.Get(place));
   RunImpl(scope, place);
-  VLOG(10) << "+ " << DebugStringEx(&scope);
+  VLOG(3) << place << " " << DebugStringEx(&scope);
 }
 
 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -172,6 +203,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
         if (row_size >= 0) {
           ss << "[row_size=" << row_size << "]";
         }
+        std::string dtype = GetDtype(*scope, input.second[i]);
+        ss << ":" << dtype;
         ss << "[" << GetDims(*scope, input.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
@@ -608,9 +641,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
-  // For profiling, don't move out of this function because that will result
-  // in the failure of multi-GPU profiling.
-  platform::RecordEvent record_event(Type(), dev_ctx);
   // check if op[type] has kernel registered.
   auto& all_op_kernels = AllOpKernels();
   auto kernels_iter = all_op_kernels.find(type_);
@@ -679,6 +709,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       if (var == nullptr) continue;
       if (var->IsType<framework::LoDTensor>()) {
         CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+      } else if (var->IsType<framework::SelectedRows>()) {
+        CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
       }
     }
   }
@@ -746,6 +778,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
     const ExecutionContext& ctx) const {
   auto& scope = ctx.scope();
   int data_type = -1;
+  std::string last_input_name;
   for (auto& input : this->inputs_) {
     for (auto& ipt_name : input.second) {
       auto* var = scope.FindVar(ipt_name);
@@ -762,9 +795,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
           int tmp = static_cast<int>(ToDataType(t->type()));
           PADDLE_ENFORCE(
               tmp == data_type || data_type == -1,
-              "DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
-              data_type, tmp);
+              "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)",
+              Type(), last_input_name, data_type, ipt_name, tmp);
           data_type = tmp;
+          last_input_name = ipt_name;
         }
       }
     }
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 1e5bba62b5..275cb8c592 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -19,19 +19,80 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
+#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
 
+std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
+    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
+    const std::string &loss_var_name,
+    const std::unordered_set<std::string> &param_names,
+    const std::vector<Scope *> &local_scopes, const bool use_cuda,
+#ifdef PADDLE_WITH_CUDA
+    const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
+#else
+    const BuildStrategy &strategy) {
+#endif
+  // Convert the program to graph.
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
+
+  // Apply a graph viz pass to record a graph.
+  if (!strategy.debug_graphviz_path_.empty()) {
+    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
+    const std::string graph_path = string::Sprintf(
+        "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
+    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    graph = viz_pass->Apply(std::move(graph));
+  }
+
+  // Convert graph to run on multi-devices.
+  auto multi_devices_pass =
+      ir::PassRegistry::Instance().Get("multi_devices_pass");
+  multi_devices_pass->SetNotOwned<const std::vector<platform::Place>>("places",
+                                                                      &places);
+  multi_devices_pass->SetNotOwned<const std::string>("loss_var_name",
+                                                     &loss_var_name);
+  multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
+      "params", &param_names);
+  multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
+                                                              &local_scopes);
+  multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
+
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+  multi_devices_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
+#endif
+  graph = multi_devices_pass->Apply(std::move(graph));
+
+  // Apply a graph print pass to record a graph with device info.
+  if (!strategy.debug_graphviz_path_.empty()) {
+    auto multi_devices_print_pass =
+        ir::PassRegistry::Instance().Get("multi_devices_print_pass");
+    multi_devices_print_pass->SetNotOwned<const std::string>(
+        "debug_graphviz_path", &strategy.debug_graphviz_path_);
+    multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
+        "graph_printer", new details::GraphvizSSAGraphPrinter);
+    graph = multi_devices_print_pass->Apply(std::move(graph));
+  }
+
+  // Verify that the graph is correct for multi-device executor.
+  auto multi_devices_check_pass =
+      ir::PassRegistry::Instance().Get("multi_devices_check_pass");
+  graph = multi_devices_check_pass->Apply(std::move(graph));
+  return graph;
+}
+
 class ParallelExecutorPrivate {
  public:
   explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
@@ -119,21 +180,19 @@ ParallelExecutor::ParallelExecutor(
     var_infos.back().persistable_ = var->Persistable();
   }
 
-  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
-  // ncclOp
-  details::SSAGraphBuilderFactory builder_factory(
-      member_->places_, loss_var_name, params, member_->local_scopes_,
-      build_strategy);
-  if (member_->use_cuda_) {
+// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
+// ncclOp
 #ifdef PADDLE_WITH_CUDA
-    builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
+  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
+      main_program, member_->places_, loss_var_name, params,
+      member_->local_scopes_, member_->use_cuda_, build_strategy,
+      member_->nccl_ctxs_.get());
 #else
-    PADDLE_THROW("Not compiled with CUDA.");
+  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
+      main_program, member_->places_, loss_var_name, params,
+      member_->local_scopes_, member_->use_cuda_, build_strategy);
 #endif
-  }
-  builder_ = builder_factory.Create();
-  std::unique_ptr<Graph> graph(new Graph(main_program));
-  graph = builder_->Apply(std::move(graph));
+
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
       exec_strategy, member_->local_scopes_, places, std::move(graph)));
   member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
@@ -146,11 +205,18 @@ void ParallelExecutor::BCastParamsToDevices(
   // the initializing bcast, all vars would be bcast from device(0),
   // otherwise
   // bcast from the specified device.
-  bool initializing = builder_.get() == nullptr ? true : false;
-
+  bool initializing = member_->executor_ ? false : true;
   for (auto &var : vars) {
-    int var_dev_id =
-        builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
+    int var_dev_id = -1;
+    if (member_->executor_) {
+      auto &sharded_var_device =
+          member_->executor_->Graph().Get<details::ShardedVarDevice>(
+              details::kShardedVarDevice);
+      if (sharded_var_device.find(var) != sharded_var_device.end()) {
+        var_dev_id = sharded_var_device.at(var);
+      }
+    }
+
     if (!initializing && var_dev_id == -1) continue;
 
     framework::Variable *main_var = nullptr;
@@ -286,3 +352,8 @@ ParallelExecutor::~ParallelExecutor() {
 
 }  // namespace framework
 }  // namespace paddle
+
+USE_PASS(graph_viz_pass);
+USE_PASS(multi_devices_pass);
+USE_PASS(multi_devices_check_pass);
+USE_PASS(multi_devices_print_pass);
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index ffb9934a2d..5fb748fa20 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/details/execution_strategy.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -70,7 +70,6 @@ class ParallelExecutor {
 
  private:
   ParallelExecutorPrivate *member_;
-  std::unique_ptr<details::SSAGraphBuilder> builder_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index ef224d68f1..0bbfd66148 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -82,7 +82,7 @@ class Tensor {
   template <typename T>
   const T* data() const;
 
-  bool IsInitialized() const;
+  inline bool IsInitialized() const;
 
   /**
    * @brief   Return a pointer to mutable memory block.
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index 0a1cb6d570..cb2061c06a 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/tensor.h"
 #include <gtest/gtest.h>
 #include <string>
+#include "paddle/fluid/platform/float16.h"
 
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
@@ -213,3 +214,17 @@ TEST(Tensor, Layout) {
   src.set_layout(framework::DataLayout::kAnyLayout);
   ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
 }
+
+TEST(Tensor, FP16) {
+  using platform::float16;
+  framework::Tensor src;
+  float16* src_ptr = src.mutable_data<float16>({2, 3}, platform::CPUPlace());
+  for (int i = 0; i < 2 * 3; ++i) {
+    src_ptr[i] = static_cast<float16>(i);
+  }
+  EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16));
+  // EXPECT a human readable error message
+  // src.data<uint8_t>();
+  // Tensor holds the wrong type, it holds N6paddle8platform7float16E at
+  // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43]
+}
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 16c9c81258..ba7645aa02 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -14,8 +14,15 @@ cc_library(paddle_fluid_api
 
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 
+# paddle_fluid_origin exclude inference api interface
+cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
+
+if(NOT APPLE)
+  add_subdirectory(api)
+endif()
+
 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
+cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api)
 if(NOT APPLE)
   # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
   set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
@@ -24,7 +31,7 @@ endif()
 
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
-    SRCS io.cc
+    SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
     DEPS ${fluid_modules} paddle_fluid_api)
 
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
@@ -32,12 +39,21 @@ if(NOT APPLE)
   # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
   set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map")
   set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  # check symbol hidden
+  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
+    "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
+    " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n"
+    "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
+    "  message(FATAL_ERROR \"Check symbol failed.\")\n"
+    "endif()\n")
+  add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
+    COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
+    DEPENDS paddle_fluid_shared)
+  add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
 endif()
 
 if(WITH_TESTING)
   # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
   add_subdirectory(tests/book)
 endif()
-if(NOT APPLE)
-  add_subdirectory(api)
-endif()
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 67d355d10d..27fe575cb6 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
   tensorrt_subgraph_node_mark_pass.cc
   analyzer.cc
   helper.cc
+        model_store_pass.cc
   DEPS framework_proto proto_desc)
 cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
+cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis)
 
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
 
@@ -40,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
 inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
 inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
+inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index b3a1075e5a..c4ab26a228 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -17,18 +17,22 @@
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
 
 namespace paddle {
 
-DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
+DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true,
             "Enable subgraph to TensorRT engine for acceleration");
 
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
               "Graphviz debuger for data flow graphs.");
 
+DEFINE_string(inference_analysis_output_storage_path, "",
+              "optimized model output path");
+
 namespace inference {
 namespace analysis {
 
@@ -38,15 +42,27 @@ class DfgPassManagerImpl final : public DfgPassManager {
     // TODO(Superjomn) set the key with pass reprs.
     AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
     if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
-      auto trt_teller = [](const Node* node) {
+      auto trt_teller = [&](const Node* node) {
+        std::unordered_set<std::string> teller_set(
+            {"elementwise_add", "mul", "conv2d", "pool2d", "relu"});
         if (!node->IsFunction()) return false;
-        return static_cast<const Function*>(node)->func_type() == "mul";
+
+        const auto* func = static_cast<const Function*>(node);
+        if (teller_set.count(func->func_type()))
+          return true;
+        else {
+          return false;
+        }
       };
+
       AddPass("tensorrt-subgraph-marker",
               new TensorRTSubgraphNodeMarkPass(trt_teller));
       AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
     }
     AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
+    if (!FLAGS_inference_analysis_output_storage_path.empty()) {
+      AddPass("model-store-pass", new ModelStorePass);
+    }
   }
 
   std::string repr() const override { return "dfg-pass-manager"; }
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 0132bf5b9c..c82fdfff86 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -16,28 +16,23 @@ limitations under the License. */
 
 /*
  * This file contains Analyzer, an class that exposed as a library that analyze
- * and optimize
- * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to
- * control whether
- * an process is applied on the program.
+ * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has
+ * multiple flags to
+ * control whether an process is applied on the program.
  *
  * The processes are called Passes in analysis, the Passes are placed in a
- * pipeline, the first
- * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to
- * a data flow
- * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow
- * graph to a
- * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes
- * which take a
- * node or data flow graph as input.
+ * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a
+ * Fluid ProgramDesc to
+ * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms
+ * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the
+ * pipeline can be any Passes
+ * which take a node or data flow graph as input.
  *
  * The Analyzer can be used in two methods, the first is a executable file which
- * can be used to
- * pre-process the inference model and can be controlled by passing difference
- * command flags;
+ * can be used to pre-process the inference model and can be controlled by
+ * passing difference command flags;
  * the other way is to compose inside the inference API as a runtime pre-process
- * phase in the
- * inference service.
+ * phase in the inference service.
  */
 
 #include <gflags/gflags.h>
@@ -50,6 +45,7 @@ namespace paddle {
 // flag if not available.
 DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
 DECLARE_string(inference_analysis_graphviz_log_root);
+DECLARE_string(inference_analysis_output_storage_path);
 
 namespace inference {
 namespace analysis {
diff --git a/paddle/fluid/inference/analysis/analyzer_main.cc b/paddle/fluid/inference/analysis/analyzer_main.cc
new file mode 100644
index 0000000000..5e1fe3eb79
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer_main.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file implements analysizer -- an executation help to analyze and
+ * optimize trained model.
+ */
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  using paddle::inference::analysis::Analyzer;
+  using paddle::inference::analysis::Argument;
+
+  Argument argument;
+  Analyzer analyzer;
+  analyzer.Run(&argument);
+
+  return 0;
+}
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 25a440e7e7..24bfb3993c 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -20,14 +20,18 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, analysis_without_tensorrt) {
+TEST(Analyzer, analysis_without_tensorrt) {
   FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   Analyzer analyser;
   analyser.Run(&argument);
 }
 
-TEST_F(DFG_Tester, analysis_with_tensorrt) {
+TEST(Analyzer, analysis_with_tensorrt) {
   FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
+  Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   Analyzer analyser;
   analyser.Run(&argument);
 }
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 6d316f20bf..a17d6281a2 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -23,6 +23,7 @@
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 
@@ -36,6 +37,16 @@ namespace analysis {
  * All the fields should be registered here for clearness.
  */
 struct Argument {
+  Argument() = default;
+  explicit Argument(const std::string& fluid_model_dir)
+      : fluid_model_dir(new std::string(fluid_model_dir)) {}
+  // The directory of the trained model.
+  std::unique_ptr<std::string> fluid_model_dir;
+  // The path of `__model__` and `param`, this is used when the file name of
+  // model and param is changed.
+  std::unique_ptr<std::string> fluid_model_program_path;
+  std::unique_ptr<std::string> fluid_model_param_path;
+
   // The graph that process by the Passes or PassManagers.
   std::unique_ptr<DataFlowGraph> main_dfg;
 
@@ -44,6 +55,9 @@ struct Argument {
 
   // The processed program desc.
   std::unique_ptr<framework::proto::ProgramDesc> transformed_program_desc;
+
+  // The output storage path of ModelStorePass.
+  std::unique_ptr<std::string> model_output_store_path;
 };
 
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
index 8a3af0a8eb..7f64bc75ae 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -337,6 +337,34 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
                         std::vector<Node *>(outputs.begin(), outputs.end()));
 }
 
+void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
+  std::vector<Node *> op_nodes;
+  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
+    if (node.type() == Node::Type::kValue || node.deleted()) {
+      continue;
+    }
+    op_nodes.push_back(&node);
+  }
+  size_t op_num = op_nodes.size();
+  for (size_t i = 0; i < op_num; i++) {
+    if (op_nodes[i]->type() == Node::Type::kFunction) continue;
+    std::unordered_set<std::string> follow_up_input_names;
+    for (size_t j = i + 1; j < op_num; j++) {
+      for (auto *in : op_nodes[j]->inlinks) {
+        follow_up_input_names.insert(in->name());
+      }
+    }
+    std::vector<Node *> filtered_subgraph_outlinks;
+    for (auto *out : op_nodes[i]->outlinks) {
+      if (follow_up_input_names.count(out->name())) {
+        filtered_subgraph_outlinks.push_back(out);
+      }
+    }
+    PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
+    op_nodes[i]->outlinks = filtered_subgraph_outlinks;
+  }
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index 1c60d5de21..bb3ec6bbc1 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -36,6 +36,8 @@ namespace analysis {
 
 /*
  * DataFlowGraph - A container of Value and Function Nodes.
+ *
+ * This is the base graph for any other type of graphs, such as SSA or CFG.
  */
 struct DataFlowGraph {
   NodeMap nodes;
@@ -174,8 +176,9 @@ struct GraphTraits<DataFlowGraph> {
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
 std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);  // NOLINT
 
+void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph);
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
index 7912f8d7f1..a881262665 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -20,7 +20,7 @@ namespace inference {
 namespace analysis {
 
 TEST(DataFlowGraph, BFS) {
-  auto desc = LoadProgramDesc();
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
 
@@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) {
 }
 
 TEST(DataFlowGraph, DFS) {
-  auto desc = LoadProgramDesc();
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   dfg.Build();
   GraphTraits<DataFlowGraph> trait(&dfg);
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index 2328d87042..18c32fa091 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -23,7 +23,7 @@
 namespace paddle {
 namespace inference {
 
-DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_max_batchsize, 3, "TensorRT maximum batch size");
 DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
 
 namespace analysis {
@@ -52,6 +52,7 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
 bool DataFlowGraphToFluidPass::Finalize() { return true; }
 
 void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
+  FilterRedundantOutputOfSubGraph(graph);
   LOG(INFO) << "graph.inputs " << graph->inputs.size();
   for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
     if (node.deleted()) continue;
@@ -87,34 +88,113 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
 }
 
 void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
-                       const framework::proto::BlockDesc &block) {
+                       framework::proto::BlockDesc *block) {
   static int counter{0};
   PADDLE_ENFORCE(node->IsFunctionBlock());
   framework::OpDesc desc;
   auto *func = static_cast<FunctionBlock *>(node);
 
   // collect inputs
-  std::vector<std::string> io;
+  std::unordered_set<std::string> input_names;
   for (auto *x : func->inlinks) {
-    io.push_back(x->name());
+    input_names.insert(x->name());
   }
-  desc.SetInput("Xs", io);
+  desc.SetInput(
+      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
 
-  // collect outputs
-  io.clear();
+  std::unordered_set<std::string> output_names;
   for (auto *x : func->outlinks) {
-    io.push_back(x->name());
+    output_names.insert(x->name());
   }
-  desc.SetOutput("Ys", io);
+
+  std::vector<std::string> output_temp(output_names.begin(),
+                                       output_names.end());
+  desc.SetOutput("Ys", output_temp);
   desc.SetType("tensorrt_engine");
 
-  PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc");
+  std::unordered_map<std::string, std::string> output_name_map;
+
+  // The following procedure is used to rename all the intermediate
+  // variables and the output variables of the subgraph.
+  // Why we do this?
+  // During the transition from fluid OP to tensorrt OP, we map
+  // the input and output Tensor(fluid data structure) of fluid OP
+  // to the correspondin ITensor (trt data structure) through the
+  // Tensor name. When we set up ITensor for an variable, we must
+  // ensure that it has not been set before.
+  // If there is variable in the fluid graph, which is not only the
+  // input of a OP, but also the output of a Op, there will be problems.
+  // So we have to rename the variable in the subgraph to make sure
+  // it is either an OP's input or an OP's output.
+
+  auto subgraph_nodes = func->subgraph;
+  for (int index = 0; index < block->ops_size(); index++) {
+    framework::proto::OpDesc *op = block->mutable_ops(index);
+    auto correspond_node = subgraph_nodes[index];
+    PADDLE_ENFORCE_EQ(correspond_node->name(), op->type());
+
+    std::unordered_map<std::string, size_t> var2id;
+    for (auto *in_var : correspond_node->inlinks) {
+      var2id[in_var->name()] = in_var->id();
+    }
+    // rename for the input variables of op inside subgraph
+    for (int i = 0; i < op->inputs_size(); i++) {
+      framework::proto::OpDesc_Var *in_var = op->mutable_inputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < in_var->arguments_size(); k++) {
+        std::string arg_value = in_var->arguments(k);
+        if (input_names.count(arg_value)) {
+          replaced_names.push_back(arg_value);
+        } else {
+          replaced_names.push_back(arg_value +
+                                   std::to_string(var2id[arg_value]));
+        }
+      }
+      in_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        in_var->add_arguments(replaced_names[k]);
+      }
+    }
+    var2id.clear();
+    for (auto out_var : correspond_node->outlinks) {
+      var2id[out_var->name()] = out_var->id();
+    }
+
+    // rename for the output variables of op inside subgraph
+    for (int i = 0; i < op->outputs_size(); i++) {
+      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < out_var->arguments_size(); k++) {
+        std::string arg_value = out_var->arguments(k);
+        if (output_names.count(arg_value)) {
+          output_name_map[arg_value] =
+              arg_value + std::to_string(var2id[arg_value]);
+        }
+        replaced_names.push_back(arg_value + std::to_string(var2id[arg_value]));
+      }
+      out_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        out_var->add_arguments(replaced_names[k]);
+      }
+    }
+  }
+  // When tensorrt engine runs at the end of the operation,
+  // output_mapping help us copy the data from the renamed ITensor
+  // to Tensor.
+  std::vector<std::string> output_mapping;
+  for (auto name : output_names) {
+    PADDLE_ENFORCE(output_name_map.count(name) != 0);
+    output_mapping.push_back(output_name_map[name]);
+  }
+
+  PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
   // Set attrs
-  SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
+  SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
   SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
   SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
   SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
   SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
+  SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
   node->SetPbMsg(desc.Proto()->SerializeAsString());
 }
 
@@ -146,15 +226,17 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
   LOG(INFO) << "transformed variable size: "
             << block_desc.Proto()->vars().size();
   // copy ops.
+
   for (auto *node : block_node->subgraph) {
     auto *op = block_desc.AppendOp();
     PADDLE_ENFORCE(!node->pb_msg().empty());
     op->Proto()->ParseFromString(node->pb_msg());
   }
+
   *block_desc.Proto()->mutable_vars() =
       argument_->origin_program_desc->blocks(0).vars();
   PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
-  CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
+  CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
   auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
   auto *op = main_block->add_ops();
   PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
index d8fc5e580a..4ef381db29 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
@@ -26,21 +26,21 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, Test) {
-  DataFlowGraph graph;
+TEST(DataFlowGraph, Test) {
+  Argument argument(FLAGS_inference_model_dir);
 
   FluidToDataFlowGraphPass pass0;
   DataFlowGraphToFluidPass pass1;
   ASSERT_TRUE(pass0.Initialize(&argument));
   ASSERT_TRUE(pass1.Initialize(&argument));
 
-  pass0.Run(&graph);
-  pass1.Run(&graph);
+  pass0.Run(argument.main_dfg.get());
+  pass1.Run(argument.main_dfg.get());
 
   pass0.Finalize();
   pass1.Finalize();
 
-  LOG(INFO) << graph.nodes.size();
+  LOG(INFO) << argument.main_dfg->nodes.size();
 }
 
 };  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
index a6f8548475..c05b0e5d46 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
@@ -46,9 +46,9 @@ std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
   for (size_t i = 0; i < graph->nodes.size(); i++) {
     const Node &node = graph->nodes.Get(i);
     if (!config_.display_deleted_node && node.deleted()) continue;
-    for (auto &in : node.inlinks) {
-      if (!config_.display_deleted_node && in->deleted()) continue;
-      dot.AddEdge(in->repr(), node.repr(), {});
+    for (auto &out : node.outlinks) {
+      if (!config_.display_deleted_node && out->deleted()) continue;
+      dot.AddEdge(node.repr(), out->repr(), {});
     }
   }
   return dot.Build();
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
index 65842b1e85..928be79170 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -23,12 +23,18 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
-  auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
+TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) {
+  Argument argument(FLAGS_inference_model_dir);
+  FluidToDataFlowGraphPass pass0;
+  ASSERT_TRUE(pass0.Initialize(&argument));
+  pass0.Run(argument.main_dfg.get());
+
+  // auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
+
   DFG_GraphvizDrawPass::Config config("./", "test");
   DFG_GraphvizDrawPass pass(config);
   pass.Initialize(&argument);
-  pass.Run(&dfg);
+  pass.Run(argument.main_dfg.get());
 
   // test content
   std::ifstream file("./0-graph_test.dot");
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index 496921db9e..511631d3e0 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <glog/logging.h>
 #include <string>
 #include <vector>
 
@@ -25,8 +26,20 @@ namespace analysis {
 
 bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
-  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
-  PADDLE_ENFORCE(argument);
+  if (argument->origin_program_desc) {
+    LOG(WARNING) << "argument's origin_program_desc is already set, might "
+                    "duplicate called";
+  }
+  if (!argument->fluid_model_program_path) {
+    ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir);
+    argument->fluid_model_program_path.reset(
+        new std::string(*argument->fluid_model_dir + "/__model__"));
+  }
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path);
+  auto program = LoadProgramDesc(*argument->fluid_model_program_path);
+  argument->origin_program_desc.reset(
+      new framework::proto::ProgramDesc(program));
+
   if (!argument->main_dfg) {
     argument->main_dfg.reset(new DataFlowGraph);
   }
@@ -40,6 +53,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
   PADDLE_ENFORCE(graph);
   PADDLE_ENFORCE(desc_);
   // insert vars
+  // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id
+  // will keep updating to its latest alias during the graph-building.
   std::unordered_map<std::string, size_t> var2id;
   auto &main_block = desc_->blocks(framework::kRootBlockIndex);
   for (int i = 0; i < main_block.vars_size(); i++) {
@@ -51,6 +66,15 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     var2id[var.name()] = v->id();
   }
 
+  // The variables in a SSA can only write once, so if a variable is written
+  // multiple times(quite common in our ProgramDesc design), multiple alias
+  // Nodes of this variable will be created, and each will just write once.
+
+  // An set that keep all the names of the variables(the original, not alias)
+  // that have been written(as outputs). Once an Op's output variable hit the
+  // set, it should create a new alias and update the global alias for this
+  // variable. And that make a Data Flow Graph a SSA.
+  std::unordered_set<Node *> unique_written_vars;
   for (int i = 0; i < main_block.ops_size(); i++) {
     const auto &op = main_block.ops(i);
     auto *o = graph->nodes.Create(Node::Type::kFunction);
@@ -62,33 +86,33 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     o->SetPbMsg(op.SerializeAsString());
 
     // set inputs and outputs
-    std::unordered_set<Node *> inlinks;
     for (int j = 0; j < op.inputs_size(); j++) {
       auto &in_var = op.inputs(j);
       for (int k = 0; k < in_var.arguments_size(); k++) {
         auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
         in->outlinks.push_back(o);
         o->inlinks.push_back(in);
-        inlinks.insert(in);
       }
     }
     for (int j = 0; j < op.outputs_size(); j++) {
       auto &out_var = op.outputs(j);
       for (int k = 0; k < out_var.arguments_size(); k++) {
         auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
-        if (inlinks.count(out)) {
+        if (unique_written_vars.count(out)) {
           // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
           auto *out_alias = graph->nodes.Create(Node::Type::kValue);
           out_alias->SetName(out->name());
           out_alias->SetPbDesc(out->pb_desc());
           out_alias->SetPbMsg(out->pb_msg());
-          var2id[out_alias->name()] = out_alias->id();  // update a -> a0
+          var2id[out_alias->name()] =
+              out_alias->id();  // update variable's alias Node
           LOG(INFO) << "loop found in graph, create SSA alias node ["
                     << out_alias->repr() << "] for [" << out->repr() << "]";
           out = out_alias;
         }
         out->inlinks.push_back(o);
         o->outlinks.push_back(out);
+        unique_written_vars.insert(out);
       }
     }
   }
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
index da8463b63b..fb948bf224 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -30,7 +30,7 @@ namespace inference {
 namespace analysis {
 
 /*
- * Transform a FluidDesc to a data flow graph.
+ * Transform a FluidDesc to a SSA.
  */
 class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
  public:
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
index dadb84059d..d218dcd050 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -21,8 +21,9 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, Init) {
+TEST(FluidToDataFlowGraphPass, Test) {
   FluidToDataFlowGraphPass pass;
+  Argument argument(FLAGS_inference_model_dir);
   pass.Initialize(&argument);
   pass.Run(argument.main_dfg.get());
   // Analysis is sensitive to ProgramDesc, careful to change the original model.
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index f1064cd20f..a0f912b251 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <cstdio>
+#include <fstream>
 #include <string>
 #include <typeindex>
 #include <unordered_map>
@@ -136,6 +137,20 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) {
   }
 }
 
+static framework::proto::ProgramDesc LoadProgramDesc(
+    const std::string &model_path) {
+  std::ifstream fin(model_path, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path);
+  fin.seekg(0, std::ios::end);
+  std::string buffer(fin.tellg(), ' ');
+  fin.seekg(0, std::ios::beg);
+  fin.read(&buffer[0], buffer.size());
+  fin.close();
+  framework::proto::ProgramDesc program_desc;
+  program_desc.ParseFromString(buffer);
+  return program_desc;
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc
new file mode 100644
index 0000000000..1c42917642
--- /dev/null
+++ b/paddle/fluid/inference/analysis/model_store_pass.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/argument.h"
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void ModelStorePass::Run(DataFlowGraph *x) {
+  if (!argument_->fluid_model_param_path) {
+    PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir);
+    argument_->fluid_model_param_path.reset(
+        new std::string(*argument_->fluid_model_dir + "param"));
+  }
+  PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path);
+  // Directly copy param file to destination.
+  std::stringstream ss;
+  // NOTE these commands only works on linux.
+  ss << "mkdir -p " << *argument_->model_output_store_path;
+  LOG(INFO) << "run command: " << ss.str();
+  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
+  ss.str("");
+
+  ss << "cp " << *argument_->fluid_model_dir << "/*"
+     << " " << *argument_->model_output_store_path;
+  LOG(INFO) << "run command: " << ss.str();
+  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
+
+  // Store program
+  PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
+                          "program desc is not transformed, should call "
+                          "DataFlowGraphToFluidPass first.");
+  const std::string program_output_path =
+      *argument_->model_output_store_path + "/__model__";
+  std::ofstream file(program_output_path, std::ios::binary);
+  PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.",
+                 program_output_path);
+  const std::string serialized_message =
+      argument_->transformed_program_desc->SerializeAsString();
+  file.write(serialized_message.c_str(), serialized_message.size());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h
new file mode 100644
index 0000000000..fac7083925
--- /dev/null
+++ b/paddle/fluid/inference/analysis/model_store_pass.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file defines ModelStorePass, which store the runtime DFG to a Paddle
+ * model in the disk, and that model can be reloaded for prediction.
+ */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class ModelStorePass : public DataFlowGraphPass {
+ public:
+  bool Initialize(Argument* argument) override {
+    if (!argument) {
+      LOG(ERROR) << "invalid argument";
+      return false;
+    }
+    argument_ = argument;
+    return true;
+  }
+
+  void Run(DataFlowGraph* x) override;
+
+  std::string repr() const override { return "DFG-store-pass"; }
+  std::string description() const override {
+    return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle
+    model in the disk, and that model can be reloaded for prediction again.)DD";
+  }
+
+ private:
+  Argument* argument_{nullptr};
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc
new file mode 100644
index 0000000000..5f3526dd50
--- /dev/null
+++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/model_store_pass.h"
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+DEFINE_string(inference_model_dir, "", "Model path");
+
+TEST(DFG_StorePass, test) {
+  Analyzer analyzer;
+  Argument argument(FLAGS_inference_model_dir);
+  argument.model_output_store_path.reset(
+      new std::string("./_dfg_store_pass_tmp"));
+  // disable storage in alalyzer
+  FLAGS_inference_analysis_output_storage_path = "";
+  analyzer.Run(&argument);
+
+  ModelStorePass pass;
+  pass.Initialize(&argument);
+  pass.Run(argument.main_dfg.get());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h
index 6b4dbb3bb5..6806f9ff7d 100644
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -50,6 +50,7 @@ class Pass {
   // Create a debugger Pass that draw the DFG by graphviz toolkit.
   virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
 
+  virtual void Run() { LOG(FATAL) << "not valid"; }
   // Run on a single Node.
   virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
   // Run on a single Function.
diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc
index dac1c509d7..13423e4837 100644
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
@@ -56,7 +56,7 @@ class TestNodePass final : public NodePass {
   std::string description() const override { return "some doc"; }
 };
 
-TEST_F(DFG_Tester, DFG_pass_manager) {
+TEST(PassManager, DFG_pass_manager) {
   TestDfgPassManager manager;
   DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
 
@@ -64,12 +64,15 @@ TEST_F(DFG_Tester, DFG_pass_manager) {
   manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
   manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
 
+  Argument argument(FLAGS_inference_model_dir);
+
   ASSERT_TRUE(&argument);
   ASSERT_TRUE(manager.Initialize(&argument));
   manager.RunAll();
 }
 
-TEST_F(DFG_Tester, Node_pass_manager) {
+TEST(PassManager, Node_pass_manager) {
+  Argument argument(FLAGS_inference_model_dir);
   // Pre-process: initialize the DFG with the ProgramDesc first.
   FluidToDataFlowGraphPass pass0;
   pass0.Initialize(&argument);
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
index 389f9e1a91..80809d4c43 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
 
 std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
   std::vector<Node *> marked_nodes;
-  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes_in_TS()) {
     if (node.attr(kMarkerAttrName).Bool()) {
       marked_nodes.push_back(&node);
     }
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
index 67dd4da54b..39cc433b40 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
   return false;
 };
 
-TEST_F(DFG_Tester, Split) {
-  auto desc = LoadProgramDesc();
+TEST(SubGraphSplitter, Split) {
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
   LOG(INFO) << "spliter\n" << dfg.DotString();
 
@@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) {
   ASSERT_EQ(subgraphs.back().size(), 6UL);
 }
 
-TEST_F(DFG_Tester, Fuse) {
-  auto desc = LoadProgramDesc();
+TEST(SubGraphSplitter, Fuse) {
+  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
   auto dfg = ProgramDescToDFG(desc);
 
   size_t count0 = dfg.nodes.size();
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
index a6c15e848b..c1d932878e 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
@@ -22,11 +22,11 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
+TEST(TensorRTSubgraphNodeMarkPass, test) {
   // init
   FluidToDataFlowGraphPass pass;
+  Argument argument(FLAGS_inference_model_dir);
   ASSERT_TRUE(pass.Initialize(&argument));
-  argument.main_dfg.reset(new DataFlowGraph);
   pass.Run(argument.main_dfg.get());
 
   TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
@@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
   for (auto& node : argument.main_dfg->nodes.nodes()) {
     counter += node->attr(ATTR_supported_by_tensorrt).Bool();
   }
-
+  ASSERT_EQ(counter, 2);
   LOG(INFO) << counter << " nodes marked";
 }
 
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
index 1d749d3fa3..67a5af83d8 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
@@ -25,7 +25,7 @@ namespace analysis {
 
 DEFINE_string(dot_dir, "./", "");
 
-TEST_F(DFG_Tester, tensorrt_single_pass) {
+TEST(TensorRTSubGraphPass, main) {
   std::unordered_set<std::string> teller_set(
       {"elementwise_add", "mul", "sigmoid"});
   SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) {
@@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
     return false;
   };
 
-  LOG(INFO) << "init";
+  Argument argument(FLAGS_inference_model_dir);
+
   DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
   DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
 
@@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) {
   FluidToDataFlowGraphPass pass0;
   TensorRTSubGraphPass trt_pass(std::move(teller));
 
-  LOG(INFO) << "Initialize";
   dfg_pass.Initialize(&argument);
   dfg_pass1.Initialize(&argument);
   pass0.Initialize(&argument);
   trt_pass.Initialize(&argument);
 
-  LOG(INFO) << "Run";
   argument.main_dfg.reset(new DataFlowGraph);
   pass0.Run(argument.main_dfg.get());
   dfg_pass.Run(argument.main_dfg.get());
diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h
index ce1191a567..1073a6f686 100644
--- a/paddle/fluid/inference/analysis/ut_helper.h
+++ b/paddle/fluid/inference/analysis/ut_helper.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/analysis/helper.h"
 
 namespace paddle {
 namespace inference {
@@ -32,27 +32,12 @@ namespace analysis {
 
 DEFINE_string(inference_model_dir, "", "inference test model dir");
 
-static framework::proto::ProgramDesc LoadProgramDesc(
-    const std::string& model_dir = FLAGS_inference_model_dir) {
-  std::string msg;
-  std::string net_file = FLAGS_inference_model_dir + "/__model__";
-  std::ifstream fin(net_file, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", net_file);
-  fin.seekg(0, std::ios::end);
-  msg.resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(msg.at(0)), msg.size());
-  fin.close();
-  framework::proto::ProgramDesc program_desc;
-  program_desc.ParseFromString(msg);
-  return program_desc;
-}
-
 static DataFlowGraph ProgramDescToDFG(
     const framework::proto::ProgramDesc& desc) {
   DataFlowGraph graph;
   FluidToDataFlowGraphPass pass;
   Argument argument;
+  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
   argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
   pass.Initialize(&argument);
   pass.Run(&graph);
@@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG(
 class DFG_Tester : public ::testing::Test {
  protected:
   void SetUp() override {
-    auto desc = LoadProgramDesc(FLAGS_inference_model_dir);
+    auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
     argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
   }
 
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 7e4b3e9a2d..08d0f493ab 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -19,6 +19,7 @@ endif(APPLE)
 
 
 set(inference_deps paddle_inference_api paddle_fluid_api)
+
 if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
 endif()
@@ -42,35 +43,8 @@ function(inference_api_test TARGET_NAME)
     endif(WITH_TESTING)
 endfunction(inference_api_test)
 
-cc_library(paddle_inference_api
-    SRCS api.cc api_impl.cc
-    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
-if(NOT APPLE)
-  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/api.sym")
-  set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-endif()
-
-# Here the shared library doesn't depend on other fluid libraries, or double free will occur.
-cc_library(paddle_inference_api_shared SHARED
-    SRCS api.cc api_impl.cc)
-add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
-set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor)
 
-if(NOT APPLE)
-  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/api.map")
-  set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
-    "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
-    " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n"
-    "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
-    "  message(FATAL_ERROR \"Check symbol failed.\")\n"
-    "endif()\n")
-  add_custom_command(
-    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
-    COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
-    DEPENDS paddle_inference_api_shared)
-  add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
-endif()
 
 cc_test(test_paddle_inference_api
         SRCS api_tester.cc
@@ -90,6 +64,8 @@ endif()
 if (WITH_ANAKIN) # only needed in CI
     # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
     # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
+    # compile the libinference_anakin_api.a and compile with anakin.so.
+    fetch_include_recursively(${ANAKIN_INCLUDE})
     # compile the libinference_anakin_api.a and anakin.so.
     nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
     nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
@@ -98,9 +74,10 @@ if (WITH_ANAKIN) # only needed in CI
     target_link_libraries(inference_anakin_api anakin anakin_saber_common)
     target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
     if (WITH_TESTING)
-        cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
-                                  ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
-                                  DEPS inference_anakin_api)
-        target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+        # this test is unstable, disable it first.
+        #cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
+                                  #ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
+                                  #DEPS inference_anakin_api_shared)
+        #target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
      endif(WITH_TESTING)
 endif()
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index e74f23ff96..63c3f0d7b3 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <glog/logging.h>
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 
 namespace paddle {
@@ -40,19 +41,36 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other)
 PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
 
 PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
+  if (!other.memory_owned_) {
+    data_ = other.data_;
+    length_ = other.length_;
+    memory_owned_ = other.memory_owned_;
+  } else {
+    Resize(other.length());
+    memcpy(data_, other.data(), other.length());
+    length_ = other.length();
+    memory_owned_ = true;
+  }
+  return *this;
+}
+
+PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
   // only the buffer with external memory can be copied
-  assert(!other.memory_owned_);
   data_ = other.data_;
   length_ = other.length_;
   memory_owned_ = other.memory_owned_;
+  other.data_ = nullptr;
+  other.length_ = 0;
+  other.memory_owned_ = false;
   return *this;
 }
 
 void PaddleBuf::Resize(size_t length) {
   // Only the owned memory can be reset, the external memory can't be changed.
   if (length_ == length) return;
-  assert(memory_owned_);
-  Free();
+  if (memory_owned_) {
+    Free();
+  }
   data_ = new char[length];
   length_ = length;
   memory_owned_ = true;
@@ -68,7 +86,7 @@ void PaddleBuf::Reset(void* data, size_t length) {
 void PaddleBuf::Free() {
   if (memory_owned_ && data_) {
     assert(length_ > 0);
-    delete static_cast<char*>(data_);
+    delete[] static_cast<char*>(data_);
     data_ = nullptr;
     length_ = 0;
   }
diff --git a/paddle/fluid/inference/api/api.map b/paddle/fluid/inference/api/api.map
deleted file mode 100644
index 5203784dc1..0000000000
--- a/paddle/fluid/inference/api/api.map
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-	global:
-		*paddle*;
-	local:
-		*;
-};
diff --git a/paddle/fluid/inference/api/api.sym b/paddle/fluid/inference/api/api.sym
deleted file mode 100644
index ef2a04d788..0000000000
--- a/paddle/fluid/inference/api/api.sym
+++ /dev/null
@@ -1 +0,0 @@
-*paddle*
diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc
index 0206ac6010..6b374ceefb 100644
--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
@@ -18,26 +18,36 @@
 
 namespace paddle {
 
-PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
+template <typename Target>
+PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor(
     const AnakinConfig &config) {
   CHECK(Init(config));
 }
 
-bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
+template <typename Target>
+bool PaddleInferenceAnakinPredictor<Target>::Init(const AnakinConfig &config) {
   if (!(graph_.load(config.model_file))) {
+    LOG(FATAL) << "fail to load graph from " << config.model_file;
     return false;
   }
-  graph_.ResetBatchSize("input_0", config.max_batch_size);
+  auto inputs = graph_.get_ins();
+  for (auto &input_str : inputs) {
+    graph_.ResetBatchSize(input_str, config.max_batch_size);
+  }
   // optimization for graph
   if (!(graph_.Optimize())) {
     return false;
   }
   // construct executer
-  executor_.init(graph_);
+  if (executor_p_ == nullptr) {
+    executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
+                                  anakin::Precision::FP32>(graph_, true);
+  }
   return true;
 }
 
-bool PaddleInferenceAnakinPredictor::Run(
+template <typename Target>
+bool PaddleInferenceAnakinPredictor<Target>::Run(
     const std::vector<PaddleTensor> &inputs,
     std::vector<PaddleTensor> *output_data, int batch_size) {
   for (const auto &input : inputs) {
@@ -46,7 +56,29 @@ bool PaddleInferenceAnakinPredictor::Run(
                  << "'s type is not float";
       return false;
     }
-    auto d_tensor_in_p = executor_.get_in(input.name);
+    auto d_tensor_in_p = executor_p_->get_in(input.name);
+    auto net_shape = d_tensor_in_p->valid_shape();
+    if (net_shape.size() != input.shape.size()) {
+      LOG(ERROR) << " input  " << input.name
+                 << "'s shape size should be equal to that of net";
+      return false;
+    }
+    int sum = 1;
+    for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; });
+    if (sum > net_shape.count()) {
+      graph_.Reshape(input.name, input.shape);
+      delete executor_p_;
+      executor_p_ = new anakin::Net<Target, anakin::saber::AK_FLOAT,
+                                    anakin::Precision::FP32>(graph_, true);
+      d_tensor_in_p = executor_p_->get_in(input.name);
+    }
+
+    anakin::saber::Shape tmp_shape;
+    for (auto s : input.shape) {
+      tmp_shape.push_back(s);
+    }
+    d_tensor_in_p->reshape(tmp_shape);
+
     float *d_data_p = d_tensor_in_p->mutable_data();
     if (cudaMemcpy(d_data_p, static_cast<float *>(input.data.data()),
                    d_tensor_in_p->valid_size() * sizeof(float),
@@ -56,16 +88,17 @@ bool PaddleInferenceAnakinPredictor::Run(
     }
     cudaStreamSynchronize(NULL);
   }
-
-  executor_.prediction();
+  cudaDeviceSynchronize();
+  executor_p_->prediction();
+  cudaDeviceSynchronize();
 
   if (output_data->empty()) {
     LOG(ERROR) << "At least one output should be set with tensors' names.";
     return false;
   }
   for (auto &output : *output_data) {
-    auto *tensor = executor_.get_out(output.name);
-    output.shape = tensor->shape();
+    auto *tensor = executor_p_->get_out(output.name);
+    output.shape = tensor->valid_shape();
     if (output.data.length() < tensor->valid_size() * sizeof(float)) {
       output.data.Resize(tensor->valid_size() * sizeof(float));
     }
@@ -81,19 +114,23 @@ bool PaddleInferenceAnakinPredictor::Run(
   return true;
 }
 
-anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
-    &PaddleInferenceAnakinPredictor::get_executer() {
-  return executor_;
+template <typename Target>
+anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+    &PaddleInferenceAnakinPredictor<Target>::get_executer() {
+  return *executor_p_;
 }
 
 // the cloned new Predictor of anakin share the same net weights from original
 // Predictor
-std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
+template <typename Target>
+std::unique_ptr<PaddlePredictor>
+PaddleInferenceAnakinPredictor<Target>::Clone() {
   VLOG(3) << "Anakin Predictor::clone";
-  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
+  std::unique_ptr<PaddlePredictor> cls(
+      new PaddleInferenceAnakinPredictor<Target>());
   // construct executer from other graph
   auto anakin_predictor_p =
-      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
+      dynamic_cast<PaddleInferenceAnakinPredictor<Target> *>(cls.get());
   if (!anakin_predictor_p) {
     LOG(ERROR) << "fail to call Init";
     return nullptr;
@@ -103,14 +140,28 @@ std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
   return std::move(cls);
 }
 
+template class PaddleInferenceAnakinPredictor<anakin::NV>;
+template class PaddleInferenceAnakinPredictor<anakin::X86>;
+
 // A factory to help create difference predictor.
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
   VLOG(3) << "Anakin Predictor create.";
-  std::unique_ptr<PaddlePredictor> x(
-      new PaddleInferenceAnakinPredictor(config));
-  return x;
-}
+  if (config.target_type == AnakinConfig::NVGPU) {
+    VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
+    std::unique_ptr<PaddlePredictor> x(
+        new PaddleInferenceAnakinPredictor<anakin::NV>(config));
+    return x;
+  } else if (config.target_type == AnakinConfig::X86) {
+    VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
+    std::unique_ptr<PaddlePredictor> x(
+        new PaddleInferenceAnakinPredictor<anakin::X86>(config));
+    return x;
+  } else {
+    VLOG(3) << "Anakin Predictor create on unknown platform.";
+    return nullptr;
+  }
+};
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h
index def096c867..836badd979 100644
--- a/paddle/fluid/inference/api/api_anakin_engine.h
+++ b/paddle/fluid/inference/api/api_anakin_engine.h
@@ -20,14 +20,16 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 
-// from anakin
 #include "framework/core/net/net.h"
+#include "framework/graph/graph.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "saber/core/shape.h"
 #include "saber/saber_types.h"
 
 namespace paddle {
 
+template <typename Target>
 class PaddleInferenceAnakinPredictor : public PaddlePredictor {
  public:
   PaddleInferenceAnakinPredictor() {}
@@ -42,19 +44,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
 
   std::unique_ptr<PaddlePredictor> Clone() override;
 
-  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
+  anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
   get_executer();
 
-  ~PaddleInferenceAnakinPredictor() override{};
+  ~PaddleInferenceAnakinPredictor() override {
+    delete executor_p_;
+    executor_p_ = nullptr;
+  };
 
  private:
   bool Init(const AnakinConfig& config);
 
-  anakin::graph::Graph<anakin::NV, anakin::saber::AK_FLOAT,
-                       anakin::Precision::FP32>
+  anakin::graph::Graph<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
       graph_;
-  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
-      executor_;
+  anakin::Net<Target, anakin::saber::AK_FLOAT, anakin::Precision::FP32>*
+      executor_p_{nullptr};
   AnakinConfig config_;
 };
 
diff --git a/paddle/fluid/inference/api/api_anakin_engine_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_tester.cc
index d6d631bfba..62e820b68c 100644
--- a/paddle/fluid/inference/api/api_anakin_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc
@@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 
-DEFINE_string(model, "", "Directory of the inference model.");
+DEFINE_string(model, "", "Directory of the inference model(mobile_v2).");
 
 namespace paddle {
 
 AnakinConfig GetConfig() {
   AnakinConfig config;
+  // using AnakinConfig::X86 if you need to use cpu to do inference
+  config.target_type = AnakinConfig::NVGPU;
   config.model_file = FLAGS_model;
   config.device = 0;
   config.max_batch_size = 1;
@@ -36,28 +38,27 @@ TEST(inference, anakin) {
       CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
 
   float data[1 * 3 * 224 * 224] = {1.0f};
-
-  PaddleTensor tensor{.name = "input_0",
-                      .shape = std::vector<int>({1, 3, 224, 224}),
-                      .data = PaddleBuf(data, sizeof(data)),
-                      .dtype = PaddleDType::FLOAT32};
+  PaddleTensor tensor;
+  tensor.name = "input_0";
+  tensor.shape = std::vector<int>({1, 3, 224, 224});
+  tensor.data = PaddleBuf(data, sizeof(data));
+  tensor.dtype = PaddleDType::FLOAT32;
 
   // For simplicity, we set all the slots with the same data.
-  std::vector<PaddleTensor> paddle_tensor_feeds;
-  paddle_tensor_feeds.emplace_back(std::move(tensor));
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
 
-  PaddleTensor tensor_out{.name = "prob_out",
-                          .shape = std::vector<int>({1000, 1}),
-                          .data = PaddleBuf(),
-                          .dtype = PaddleDType::FLOAT32};
+  PaddleTensor tensor_out;
+  tensor_out.name = "prob_out";
+  tensor_out.shape = std::vector<int>({});
+  tensor_out.data = PaddleBuf();
+  tensor_out.dtype = PaddleDType::FLOAT32;
 
-  std::vector<PaddleTensor> outputs;
-  outputs.emplace_back(std::move(tensor_out));
+  std::vector<PaddleTensor> outputs(1, tensor_out);
 
   ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
 
   float* data_o = static_cast<float*>(outputs[0].data.data());
-  for (size_t j = 0; j < 1000; ++j) {
+  for (size_t j = 0; j < outputs[0].data.length(); ++j) {
     LOG(INFO) << "output[" << j << "]: " << data_o[j];
   }
 }
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 58fd7c6f8b..08d7af6d3a 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
     std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                 inputs[i].data.length());
+    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
+    framework::LoD lod;
+    for (auto &level : inputs[i].lod) {
+      lod.emplace_back(level);
+    }
+    input.set_lod(lod);
+
     feeds->push_back(input);
   }
   return true;
@@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch(
       buffer.Resize(sizeof(float) * data.size());
     }
     std::memcpy(buffer.data(), data.data(), buffer.length());
+    // copy LoD
+    for (const auto &level : fetchs[i].lod()) {
+      outputs->at(i).lod.emplace_back(level);
+    }
     outputs->at(i).dtype = PaddleDType::FLOAT32;
     // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
   }
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
index c0891e9c28..45b5a7638b 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
   void OptimizeInferenceProgram() {
     // Analyze inference_program
     Argument argument;
+    if (!config_.model_dir.empty()) {
+      argument.fluid_model_dir.reset(new std::string(config_.model_dir));
+    } else {
+      PADDLE_ENFORCE(
+          !config_.param_file.empty(),
+          "Either model_dir or (param_file, prog_file) should be set.");
+      PADDLE_ENFORCE(!config_.prog_file.empty());
+      argument.fluid_model_program_path.reset(
+          new std::string(config_.prog_file));
+      argument.fluid_model_param_path.reset(
+          new std::string(config_.param_file));
+    }
     argument.origin_program_desc.reset(
         new ProgramDesc(*inference_program_->Proto()));
     Singleton<Analyzer>::Global().Run(&argument);
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
index 62d98a7967..fcbf9b89d6 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
@@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
     std::vector<int64_t> data(20);
     for (int i = 0; i < 20; i++) data[i] = i;
 
-    PaddleTensor tensor{
-        .name = "",
-        .shape = std::vector<int>({10, 1}),
-        .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)),
-        .dtype = PaddleDType::INT64};
+    PaddleTensor tensor;
+    tensor.shape = std::vector<int>({10, 1});
+    tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t));
+    tensor.dtype = PaddleDType::INT64;
 
     // For simplicity, we set all the slots with the same data.
     std::vector<PaddleTensor> slots(4, tensor);
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 7f9bb4b33e..ba73a6eaa6 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -55,11 +55,9 @@ endif()
 # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
   set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.a
       ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a)
 else()
   set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.so
       ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
 endif()
 set(EXTERNAL_LIB "-lrt -ldl -lpthread")
diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh
new file mode 100755
index 0000000000..0d9f3d2aa2
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/clean.sh
@@ -0,0 +1,4 @@
+set -x
+cd `dirname $0`
+rm -rf build/ data/
+set +x
diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
index 5f96fecf93..03ac79e9ed 100644
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -47,10 +47,10 @@ void Main(bool use_gpu) {
     //# 2. Prepare input.
     int64_t data[4] = {1, 2, 3, 4};
 
-    PaddleTensor tensor{.name = "",
-                        .shape = std::vector<int>({4, 1}),
-                        .data = PaddleBuf(data, sizeof(data)),
-                        .dtype = PaddleDType::INT64};
+    PaddleTensor tensor;
+    tensor.shape = std::vector<int>({4, 1});
+    tensor.data = PaddleBuf(data, sizeof(data));
+    tensor.dtype = PaddleDType::INT64;
 
     // For simplicity, we set all the slots with the same data.
     std::vector<PaddleTensor> slots(4, tensor);
@@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) {
       for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
         // 2. Dummy Input Data
         int64_t data[4] = {1, 2, 3, 4};
-        PaddleTensor tensor{.name = "",
-                            .shape = std::vector<int>({4, 1}),
-                            .data = PaddleBuf(data, sizeof(data)),
-                            .dtype = PaddleDType::INT64};
+        PaddleTensor tensor;
+        tensor.shape = std::vector<int>({4, 1});
+        tensor.data = PaddleBuf(data, sizeof(data));
+        tensor.dtype = PaddleDType::INT64;
+
         std::vector<PaddleTensor> inputs(4, tensor);
         std::vector<PaddleTensor> outputs;
         // 3. Run
diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
index 0a2a2b713a..3800d49b34 100644
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -20,8 +20,8 @@ limitations under the License. */
 #include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
 #include <fstream>
 #include <iostream>
+#include "paddle/fluid/inference/demo_ci/utils.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "utils.h"
 
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -123,11 +123,11 @@ void Main(bool use_gpu) {
   file.close();
 
   // Inference.
-  PaddleTensor input{
-      .name = "xx",
-      .shape = record.shape,
-      .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)),
-      .dtype = PaddleDType::FLOAT32};
+  PaddleTensor input;
+  input.shape = record.shape;
+  input.data =
+      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
+  input.dtype = PaddleDType::FLOAT32;
 
   VLOG(3) << "run executor";
   std::vector<PaddleTensor> output;
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 2f8b4f8596..b24414e824 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -40,6 +40,7 @@ class PaddleBuf {
   // Copy only available when memory is managed externally.
   explicit PaddleBuf(const PaddleBuf&);
   PaddleBuf& operator=(const PaddleBuf&);
+  PaddleBuf& operator=(PaddleBuf&&);
   // Do not own the memory.
   PaddleBuf(void* data, size_t length)
       : data_(data), length_(length), memory_owned_{false} {}
@@ -67,9 +68,9 @@ struct PaddleTensor {
   PaddleTensor() = default;
   std::string name;  // variable name.
   std::vector<int> shape;
-  // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
   PaddleBuf data;  // blob of data.
   PaddleDType dtype;
+  std::vector<std::vector<uint64_t>> lod;  // lod data
 };
 
 enum class PaddleEngineKind {
@@ -126,9 +127,11 @@ struct NativeConfig : public PaddlePredictor::Config {
 
 // Configurations for Anakin engine.
 struct AnakinConfig : public PaddlePredictor::Config {
+  enum TargetType { NVGPU = 0, X86 };
   int device;
   std::string model_file;
   int max_batch_size{-1};
+  TargetType target_type;
 };
 
 struct TensorRTConfig : public NativeConfig {
diff --git a/paddle/fluid/inference/api/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh
similarity index 64%
rename from paddle/fluid/inference/api/check_symbol.sh
rename to paddle/fluid/inference/check_symbol.sh
index 6547ca1413..12b7b3e7e5 100755
--- a/paddle/fluid/inference/api/check_symbol.sh
+++ b/paddle/fluid/inference/check_symbol.sh
@@ -3,8 +3,8 @@
 lib=$1
 if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
 
-num_paddle_syms=$(nm -D --defined-only ${lib} | grep paddle | wc -l)
-num_google_syms=$(nm -D --defined-only ${lib} | grep google | wc -l)
+num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l)
+num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep T | wc -l)
 
 if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
 if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 748f5a084e..8f42a37cd3 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,7 +1,8 @@
 # Add TRT tests
 nv_library(tensorrt_converter
-  SRCS mul_op.cc conv2d_op.cc fc_op.cc
-  DEPS tensorrt_engine mul_op)
+  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
+activation_op.cc
+  DEPS tensorrt_engine operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
   ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
@@ -13,3 +14,10 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
 nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
+nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
+nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
+
+nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 8e7e23377d..dba1d50b2d 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -20,11 +20,60 @@ namespace tensorrt {
 
 class Conv2dOpConverter : public OpConverter {
  public:
-  Conv2dOpConverter() {}
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode) override {
     LOG(INFO)
         << "convert a fluid conv2d op to tensorrt conv layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
+
+    auto* X = engine_->GetITensor(op_desc.Input("Input").front());
+    // Declare weights
+    auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
+    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+
+    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL);
+    const int n_output = Y_t->dims()[0];
+    const int filter_h = Y_t->dims()[2];
+    const int filter_w = Y_t->dims()[3];
+
+    const int groups = boost::get<int>(op_desc.GetAttr("groups"));
+    const std::vector<int> dilations =
+        boost::get<std::vector<int>>(op_desc.GetAttr("dilations"));
+    const std::vector<int> strides =
+        boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+    const std::vector<int> paddings =
+        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+
+    nvinfer1::DimsHW nv_ksize(filter_h, filter_w);
+    nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]);
+    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
+    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(weight_data),
+                                  Y_t->memory_size() / sizeof(float)};
+
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Convolution, *const_cast<nvinfer1::ITensor*>(X), n_output,
+        nv_ksize, weight.get(), bias.get());
+    PADDLE_ENFORCE(layer != nullptr);
+    layer->setStride(nv_strides);
+    layer->setPadding(nv_paddings);
+    layer->setDilation(nv_dilations);
+    layer->setNbGroups(groups);
+
+    auto output_name = op_desc.Output("Output").front();
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
   }
 };
 
diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
new file mode 100644
index 0000000000..3744550f60
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -0,0 +1,210 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class ElementwiseWeightOpConverter : public OpConverter {
+ public:
+  ElementwiseWeightOpConverter() {}
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    // Here the two nullptr looks strange, that's because the
+    // framework::OpDesc's constructor is strange.
+    framework::OpDesc op_desc(op, nullptr);
+    LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer";
+
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+    nvinfer1::Dims dims_x = X->getDimensions();
+    PADDLE_ENFORCE(dims_x.nbDims >= 3);
+
+    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
+    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+    auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
+
+    std::vector<int> dims_y = framework::vectorize2int(Y_t->dims());
+    if (static_cast<int>(dims_y.size()) == dims_x.nbDims + 1) {
+      if (dims_y[0] == 1) dims_y.erase(dims_y.begin());
+    }
+
+    if (static_cast<int>(dims_y.size()) == 1 && dims_y[0] == dims_x.d[0]) {
+      scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+    } else if (static_cast<int>(dims_y.size()) == dims_x.nbDims &&
+               dims_y[0] == dims_x.d[0]) {
+      scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
+      for (int i = 1; i < dims_x.nbDims; i++) {
+        if (dims_y[i] != dims_x.d[i]) {
+          scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+          break;
+        }
+      }
+      if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
+        for (int i = 1; i < dims_x.nbDims; i++) {
+          if (dims_y[i] != 1)
+            PADDLE_THROW(
+                "TensorRT unsupported weight shape for Elementwise op!");
+        }
+      }
+    } else {
+      PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!");
+    }
+
+    TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
+                                         static_cast<void*>(weight_data),
+                                         Y_t->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
+                                         0};
+    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
+                                         0};
+
+    nvinfer1::IScaleLayer* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Scale, *const_cast<nvinfer1::ITensor*>(X), scale_mode,
+        shift_weights.get(), scale_weights.get(), power_weights.get());
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+class ElementwiseTensorOpConverter : public OpConverter {
+ public:
+  ElementwiseTensorOpConverter() {}
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    // Here the two nullptr looks strange, that's because the
+    // framework::OpDesc's constructor is strange.
+    framework::OpDesc op_desc(op, nullptr);
+    LOG(INFO) << "convert a fluid elementwise op to tensorrt IScaleLayer";
+
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+    auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
+    nvinfer1::Dims dims_x = X->getDimensions();
+    nvinfer1::Dims dims_y = Y->getDimensions();
+
+    // The two input tensor should have the same dims
+    PADDLE_ENFORCE(dims_x.nbDims >= 3);
+    if (dims_x.nbDims == dims_y.nbDims) {
+      for (int i = 0; i < dims_x.nbDims; i++) {
+        if (dims_x.d[i] != dims_y.d[i])
+          PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
+      }
+    } else {
+      PADDLE_THROW("TensorRT unsupported tensor shape for Elementwise op!");
+    }
+
+    auto op_pair = ops.find(op_type_);
+    if (op_pair == ops.end()) {
+      PADDLE_THROW("Wrong elementwise op type!");
+    }
+    nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
+        *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
+
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
+  }
+
+ protected:
+  static const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
+      ops;
+  std::string op_type_;
+};
+
+const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
+    ElementwiseTensorOpConverter::ops = {
+        {"add", nvinfer1::ElementWiseOperation::kSUM},
+        {"mul", nvinfer1::ElementWiseOperation::kPROD},
+        {"sub", nvinfer1::ElementWiseOperation::kSUB},
+        {"div", nvinfer1::ElementWiseOperation::kDIV},
+        {"min", nvinfer1::ElementWiseOperation::kMIN},
+        {"pow", nvinfer1::ElementWiseOperation::kPOW},
+        {"max", nvinfer1::ElementWiseOperation::kMAX},
+};
+
+class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
+};
+
+class ElementwiseTensorMulOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorMulOpConverter() { op_type_ = "mul"; }
+};
+
+class ElementwiseTensorSubOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorSubOpConverter() { op_type_ = "sub"; }
+};
+
+class ElementwiseTensorDivOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorDivOpConverter() { op_type_ = "div"; }
+};
+
+class ElementwiseTensorMinOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorMinOpConverter() { op_type_ = "min"; }
+};
+
+class ElementwiseTensorMaxOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorMaxOpConverter() { op_type_ = "max"; }
+};
+
+class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
+ public:
+  ElementwiseTensorPowOpConverter() { op_type_ = "pow"; }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightOpConverter);
+
+REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
+                          ElementwiseTensorAddOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_sub_tensor,
+                          ElementwiseTensorSubOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_div_tensor,
+                          ElementwiseTensorDivOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_mul_tensor,
+                          ElementwiseTensorMulOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_max_tensor,
+                          ElementwiseTensorMaxOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor,
+                          ElementwiseTensorMinOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor,
+                          ElementwiseTensorPowOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 409efac679..39fe1f609d 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -38,7 +38,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
 }
 // indata c * k
 // Reorder the data layout from CK to KC.
-void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
+void ReorderCKtoKC(TensorRTEngine::Weight& iweights,  // NOLINT
                    TensorRTEngine::Weight* oweights) {
   int c = iweights.dims[0];
   int k = iweights.dims[1];
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index 3c34295736..514eb659a8 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -49,5 +49,4 @@ class MulOpConverter : public OpConverter {
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(mul);
 REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 968f7eb99c..41faaf7212 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -55,6 +55,32 @@ class OpConverter {
         it = Registry<OpConverter>::Lookup("fc");
       }
     }
+    if (op_desc.Type().find("elementwise") != std::string::npos) {
+      static std::unordered_set<std::string> add_tensor_op_set{
+          "add", "mul", "sub", "div", "max", "min", "pow"};
+      // TODO(xingzhaolong): all mul, sub, div
+      // static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
+      // "sub", "div"};
+      static std::unordered_set<std::string> add_weight_op_set{"add"};
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+      int op_type_len = op_desc.Type().size();
+      std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len);
+      std::string Y = op_desc.Input("Y")[0];
+      if (parameters.count(Y)) {
+        PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0,
+                       "Unsupported elementwise type" + op_type);
+        it =
+            Registry<OpConverter>::Lookup("elementwise_" + op_type + "_weight");
+        PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+                                op_desc.Type());
+      } else {
+        PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0,
+                       "Unsupported elementwise type" + op_type);
+        it =
+            Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor");
+      }
+    }
+
     if (!it) {
       it = Registry<OpConverter>::Lookup(op_desc.Type());
     }
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
new file mode 100644
index 0000000000..11cad95361
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -0,0 +1,80 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Pool2dOp, IPoolingLayer in TRT. This Layer doesn't has weights.
+ */
+class Pool2dOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4)
+        << "convert a fluid pool2d op to tensorrt pool2d layer without bias";
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+
+    std::string pool_type =
+        boost::get<std::string>(op_desc.GetAttr("pooling_type"));
+    std::vector<int> ksize =
+        boost::get<std::vector<int>>(op_desc.GetAttr("ksize"));
+    std::vector<int> strides =
+        boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+    std::vector<int> paddings =
+        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+
+    const nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
+    const nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
+    const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
+
+    PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL);
+
+    nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX;
+    if (pool_type == "max") {
+      nv_pool_type = nvinfer1::PoolingType::kMAX;
+    } else if (pool_type == "avg") {
+      nv_pool_type = nvinfer1::PoolingType::kAVERAGE;
+    } else {
+      PADDLE_THROW("TensorRT unsupported pooling type!");
+    }
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
+                                       *const_cast<nvinfer1::ITensor*>(input1),
+                                       nv_pool_type, nv_ksize);
+    PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created.");
+    layer->setStride(nv_strides);
+    layer->setPadding(nv_paddings);
+
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(pool2d);
+REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index 7dabfd9f6a..e82762ea03 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) {
   validator.SetOp(*desc.Proto());
   LOG(INFO) << "execute";
 
-  validator.Execute(1);
+  validator.Execute(5);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
new file mode 100644
index 0000000000..f8711c6b60
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(conv2d_op, test) {
+  std::unordered_set<std::string> parameters({"conv2d-Y"});
+  framework::Scope scope;
+  TRTConvertValidation validator(5, parameters, scope, 1 << 15);
+
+  validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5));
+  validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3));
+  validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("conv2d");
+  desc.SetInput("Input", {"conv2d-X"});
+  desc.SetInput("Filter", {"conv2d-Y"});
+  desc.SetOutput("Output", {"conv2d-Out"});
+
+  const std::vector<int> strides({1, 1});
+  const std::vector<int> paddings({1, 1});
+  const std::vector<int> dilations({1, 1});
+  const int groups = 1;
+
+  desc.SetAttr("strides", strides);
+  desc.SetAttr("paddings", paddings);
+  desc.SetAttr("dilations", dilations);
+  desc.SetAttr("groups", groups);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(3);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+USE_OP(conv2d);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
new file mode 100644
index 0000000000..7537d02a35
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(elementwise_op, add_weight_test) {
+  std::unordered_set<std::string> parameters({"elementwise_add-Y"});
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1 << 15);
+  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
+  validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1));
+  // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
+  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("elementwise_add");
+  desc.SetInput("X", {"elementwise_add-X"});
+  desc.SetInput("Y", {"elementwise_add-Y"});
+  desc.SetOutput("Out", {"elementwise_add-Out"});
+
+  int axis = 1;
+  desc.SetAttr("axis", axis);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(8);
+}
+
+TEST(elementwise_op, add_tensor_test) {
+  std::unordered_set<std::string> parameters;
+  framework::Scope scope;
+  TRTConvertValidation validator(8, parameters, scope, 1 << 15);
+  validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3));
+  validator.DeclInputVar("elementwise_add-Y", nvinfer1::Dims3(10, 3, 3));
+  // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
+  validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("elementwise_add");
+  desc.SetInput("X", {"elementwise_add-X"});
+  desc.SetInput("Y", {"elementwise_add-Y"});
+  desc.SetOutput("Out", {"elementwise_add-Out"});
+
+  // the defalut axis of elementwise op is -1
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(8);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+USE_OP(elementwise_add);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
index 081f4d6059..1ae2668e73 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -24,9 +24,8 @@ TEST(fc_op, test) {
   std::unordered_set<std::string> parameters({"mul-Y"});
   framework::Scope scope;
   TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("mul-X", nvinfer1::Dims4(1, 10, 1, 1));
+  validator.DeclInputVar("mul-X", nvinfer1::Dims3(10, 1, 1));
   validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2));
-  // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
   validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2));
 
   // Prepare Op description
@@ -38,7 +37,7 @@ TEST(fc_op, test) {
 
   validator.SetOp(*desc.Proto());
 
-  validator.Execute(1);
+  validator.Execute(10);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
index 674f37f2fd..3d34cd7d5d 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -23,7 +23,7 @@ namespace tensorrt {
 TEST(MulOpConverter, main) {
   framework::Scope scope;
   std::unordered_set<std::string> parameters;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
+  TRTConvertValidation validator(10, parameters, scope, 1000, false);
   validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
   validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
   validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
@@ -39,7 +39,7 @@ TEST(MulOpConverter, main) {
   validator.SetOp(*desc.Proto());
   LOG(INFO) << "execute";
 
-  validator.Execute(1);
+  validator.Execute(2);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 9b79f86b0e..d6651a5b24 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -25,12 +25,42 @@ TEST(OpConverter, ConvertBlock) {
   framework::ProgramDesc prog;
   auto* block = prog.MutableBlock(0);
   auto* conv2d_op = block->AppendOp();
+
+  // init trt engine
+  cudaStream_t stream_;
+  std::unique_ptr<TensorRTEngine> engine_;
+  engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_));
+  engine_->InitNetwork();
+  PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+
+  engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
+                        nvinfer1::Dims3(2, 5, 5));
+
   conv2d_op->SetType("conv2d");
+  conv2d_op->SetInput("Input", {"conv2d-X"});
+  conv2d_op->SetInput("Filter", {"conv2d-Y"});
+  conv2d_op->SetOutput("Output", {"conv2d-Out"});
 
-  OpConverter converter;
+  const std::vector<int> strides({1, 1});
+  const std::vector<int> paddings({1, 1});
+  const std::vector<int> dilations({1, 1});
+  const int groups = 1;
+
+  conv2d_op->SetAttr("strides", strides);
+  conv2d_op->SetAttr("paddings", paddings);
+  conv2d_op->SetAttr("dilations", dilations);
+  conv2d_op->SetAttr("groups", groups);
+
+  // init scope
   framework::Scope scope;
-  converter.ConvertBlock(*block->Proto(), {}, scope,
-                         nullptr /*TensorRTEngine*/);
+  std::vector<int> dim_vec = {3, 2, 3, 3};
+  auto* x = scope.Var("conv2d-Y");
+  auto* x_tensor = x->GetMutable<framework::LoDTensor>();
+  x_tensor->Resize(framework::make_ddim(dim_vec));
+
+  OpConverter converter;
+  converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope,
+                         engine_.get() /*TensorRTEngine*/);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
new file mode 100644
index 0000000000..c5dddbc8cd
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <gtest/gtest.h>
+#include <fstream>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(Pool2dOpConverter, main) {
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(5, parameters, scope, 1 << 15);
+
+  // The ITensor's Dims should not contain the batch size.
+  // So, the ITensor's Dims of input and output should be C * H * W.
+  validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4));
+  validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("pool2d");
+  desc.SetInput("X", {"pool2d-X"});
+  desc.SetOutput("Out", {"pool2d-Out"});
+
+  std::vector<int> ksize({2, 2});
+  std::vector<int> strides({2, 2});
+  std::vector<int> paddings({0, 0});
+  std::string pooling_t = "max";
+
+  desc.SetAttr("pooling_type", pooling_t);
+  desc.SetAttr("ksize", ksize);
+  desc.SetAttr("strides", strides);
+  desc.SetAttr("paddings", paddings);
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(3);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(pool2d);
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index f14885b238..63c2f978f2 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -63,13 +63,16 @@ class TRTConvertValidation {
  public:
   TRTConvertValidation() = delete;
 
-  TRTConvertValidation(int batch_size,
+  TRTConvertValidation(int max_batch_size,
                        const std::unordered_set<std::string>& parameters,
                        framework::Scope& scope,  // NOLINT
-                       int workspace_size = 1 << 10)
-      : parameters_(parameters), scope_(scope) {
+                       int workspace_size = 1 << 10, bool if_add_batch = true)
+      : parameters_(parameters),
+        scope_(scope),
+        if_add_batch_(if_add_batch),
+        max_batch_size_(max_batch_size) {
     // create engine.
-    engine_.reset(new TensorRTEngine(batch_size, workspace_size, &stream_));
+    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_));
     engine_->InitNetwork();
 
     PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
@@ -84,7 +87,7 @@ class TRTConvertValidation {
 
   // Declare a parameter varaible in the scope.
   void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
-    DeclVar(name, dims);
+    DeclVar(name, dims, true);
   }
 
   void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
@@ -92,12 +95,18 @@ class TRTConvertValidation {
   }
 
   // Declare a variable in a fluid Scope.
-  void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
+  void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
+               bool is_param = false) {
     platform::CPUPlace place;
     platform::CPUDeviceContext ctx(place);
 
     // Init Fluid tensor.
     std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
+    // There is no batchsize in ITensor's shape, but We should add it to
+    // tensor's shape of fluid. If the variable is not parameter and the
+    // if_add_batch_ flag is true, add the max batchsize to dim_vec.
+    if (is_param != true && if_add_batch_ == true)
+      dim_vec.insert(dim_vec.begin(), max_batch_size_);
     auto* x = scope_.Var(name);
     auto* x_tensor = x->GetMutable<framework::LoDTensor>();
     x_tensor->Resize(framework::make_ddim(dim_vec));
@@ -131,6 +140,7 @@ class TRTConvertValidation {
 
   void Execute(int batch_size) {
     // Execute Fluid Op
+    PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
     platform::CPUPlace place;
     platform::CPUDeviceContext ctx(place);
     op_->Run(scope_, place);
@@ -139,7 +149,7 @@ class TRTConvertValidation {
     cudaStreamSynchronize(*engine_->stream());
 
     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
-    const size_t output_space_size = 2000;
+    const size_t output_space_size = 3000;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       std::vector<float> fluid_out;
       std::vector<float> trt_out(output_space_size);
@@ -149,9 +159,15 @@ class TRTConvertValidation {
       auto* var = scope_.FindVar(output);
       auto tensor = var->GetMutable<framework::LoDTensor>();
       framework::TensorToVector(*tensor, ctx, &fluid_out);
+
+      size_t fluid_out_size = fluid_out.size();
+      if (if_add_batch_ == true) {
+        fluid_out_size =
+            batch_size * (framework::product(tensor->dims()) / max_batch_size_);
+      }
       // Compare two output
       ASSERT_FALSE(fluid_out.empty());
-      for (size_t i = 0; i < fluid_out.size(); i++) {
+      for (size_t i = 0; i < fluid_out_size; i++) {
         // Loose the threshold for CI in different machine model.
         EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
       }
@@ -167,6 +183,12 @@ class TRTConvertValidation {
   std::unique_ptr<framework::OpDesc> op_desc_;
   const std::unordered_set<std::string>& parameters_;
   framework::Scope& scope_;
+  // The ITensor of trt does not cotain the batch size,
+  // bug, in most cases, we need to set batch size for
+  // fluid's tensor shape. This variable indicates
+  // whether to add batch size to tensor shape of fluid.
+  bool if_add_batch_;
+  int max_batch_size_;
 };
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index f8732e51b6..dc03702990 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -113,7 +113,7 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   ASSERT_EQ(y_cpu[1], 14.5);
 }
 
-TEST_F(TensorRTEngineTest, test_conv2d_temp) {
+TEST_F(TensorRTEngineTest, test_conv2d) {
   // Weight in CPU memory.
   float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
   float raw_bias[1] = {0};
@@ -146,6 +146,37 @@ TEST_F(TensorRTEngineTest, test_conv2d_temp) {
   ASSERT_EQ(y_cpu[1], 6.0);
 }
 
+TEST_F(TensorRTEngineTest, test_pool2d) {
+  // Weight in CPU memory.
+  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+                                  nvinfer1::Dims3{1, 2, 2});
+
+  nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
+  auto* pool_layer =
+      TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast<nvinfer1::ITensor*>(x),
+                           pool_t, nvinfer1::DimsHW{2, 2});
+
+  PADDLE_ENFORCE(pool_layer != nullptr);
+  pool_layer->setStride(nvinfer1::DimsHW{1, 1});
+  pool_layer->setPadding(nvinfer1::DimsHW{0, 0});
+
+  engine_->DeclareOutput(pool_layer, 0, "y");
+  engine_->FreezeNetwork();
+  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
+
+  float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
+  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
+                           8 * sizeof(float));
+  engine_->Execute(2);
+
+  LOG(INFO) << "to get output";
+  float* y_cpu = new float[2];
+  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
+
+  ASSERT_EQ(y_cpu[0], 2.0);
+  ASSERT_EQ(y_cpu[1], 5.0);
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index 2fa5a9540b..017fc4cd7b 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -17,7 +17,7 @@ function(inference_test TARGET_NAME)
     string(REGEX REPLACE "^_$" "" arg "${arg}")
     cc_test(test_inference_${TARGET_NAME}${arg}
         SRCS test_inference_${TARGET_NAME}.cc
-        DEPS paddle_fluid
+        DEPS paddle_fluid_origin
         ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
     set_tests_properties(test_inference_${TARGET_NAME}${arg}
         PROPERTIES DEPENDS test_${TARGET_NAME})
@@ -43,6 +43,6 @@ inference_test(word2vec)
 # TODO(TJ): clean me up
 cc_test(test_inference_nlp
   SRCS test_inference_nlp.cc
-  DEPS paddle_fluid
+  DEPS paddle_fluid_origin
   ARGS
   --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index 5cc1db12bb..e2a3e9d46e 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -20,9 +20,6 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#ifdef PADDLE_WITH_MKLML
-#include <omp.h>
-#endif
 
 DEFINE_string(model_path, "", "Directory of the inference model.");
 DEFINE_string(data_file, "", "File of input index data.");
@@ -30,6 +27,7 @@ DEFINE_int32(repeat, 100, "Running the inference program repeat times");
 DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
 DEFINE_int32(num_threads, 1, "Number of threads should be used");
 DECLARE_bool(use_mkldnn);
+DECLARE_int32(paddle_num_threads);
 
 inline double GetCurrentMs() {
   struct timeval time;
@@ -160,12 +158,7 @@ TEST(inference, nlp) {
   std::unique_ptr<paddle::framework::Scope> scope(
       new paddle::framework::Scope());
 
-#ifdef PADDLE_WITH_MKLML
-  // only use 1 thread number per std::thread
-  omp_set_dynamic(0);
-  omp_set_num_threads(1);
-  paddle::platform::SetNumThreads(1);
-#endif
+  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
 
   double start_ms = 0, stop_ms = 0;
   if (FLAGS_num_threads > 1) {
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 01a8501dd4..c2f45fdc99 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -15,6 +15,10 @@ limitations under the License. */
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "glog/logging.h"
 
+DEFINE_bool(free_idle_memory, false,
+            "If it is true, Paddle will try to free idle memory trunks during "
+            "running time.");
+
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -152,13 +156,14 @@ void BuddyAllocator::Free(void* p) {
   pool_.insert(
       IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
 
-  // Clean up if existing too much free memory
-
-  // Prefer freeing fallback allocation first
-  CleanIdleFallBackAlloc();
+  if (FLAGS_free_idle_memory) {
+    // Clean up if existing too much free memory
+    // Prefer freeing fallback allocation first
+    CleanIdleFallBackAlloc();
 
-  // Free normal allocation
-  CleanIdleNormalAlloc();
+    // Free normal allocation
+    CleanIdleNormalAlloc();
+  }
 }
 
 size_t BuddyAllocator::Used() { return total_used_; }
diff --git a/paddle/fluid/operators/.flatten_op.cc.swp b/paddle/fluid/operators/.flatten_op.cc.swp
new file mode 100644
index 0000000000..3395b6074b
Binary files /dev/null and b/paddle/fluid/operators/.flatten_op.cc.swp differ
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 4e2002ad24..4c3b8ec781 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -192,9 +192,9 @@ if(WITH_DISTRIBUTE)
     
     set(DISTRIBUTE_DEPS "")
     if(WITH_GRPC)
-        set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
+        set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
     else()
-        set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib)
+        set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
         if(WITH_BRPC_RDMA)
             find_library(IBVERBS_LIBRARY NAMES ibverbs)
             ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
@@ -270,6 +270,9 @@ op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
 op_library(unsqueeze_op DEPS reshape_op)
 op_library(squeeze_op DEPS reshape_op)
+op_library(extract_rows_op DEPS memory)
+op_library(flatten_op DEPS reshape_op)
+
 
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 1828be57b5..22cbf680c0 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -20,10 +20,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
 
-DEFINE_bool(cudnn_deterministic, true,
+DEFINE_bool(cudnn_deterministic, false,
             "Whether allow using an autotuning algorithm for convolution "
             "operator. The autotuning algorithm may be non-deterministic. If "
-            "false, the algorithm is deterministic.");
+            "true, the algorithm is deterministic.");
 
 namespace paddle {
 namespace operators {
@@ -77,7 +77,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // cudnn 7 can support groups, no need to do it mannually
     // FIXME(typhoonzero): find a better way to disable groups
     // rather than setting it to 1.
-    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
         cudnn_conv_desc, groups));
     groups = 1;
 #endif
@@ -129,7 +129,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
 
-    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
         workspace_size_limit, &algo));
@@ -140,18 +140,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     if (dev_ctx.GetComputeCapability() >= 70 &&
         std::type_index(typeid(T)) ==
             std::type_index(typeid(platform::float16))) {
-      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
       // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
     } else {
-      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
 #endif
 
     // get workspace size able to allocate
-    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, algo, &workspace_size_in_bytes));
     // It is possible for float16 on Volta GPU to allocate more memory than
@@ -165,7 +165,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
           handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
           cudnn_filter_desc, filter_data + i * group_offset_filter,
           cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
@@ -218,7 +218,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     // cudnn 7 can support groups, no need to do it mannually
     // FIXME(typhoonzero): find a better way to disable groups
     // rather than setting it to 1.
-    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+    CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
         cudnn_conv_desc, groups));
     groups = 1;
 #endif
@@ -272,8 +272,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      if (FLAGS_cudnn_deterministic) {
-        PADDLE_ENFORCE(
+      if (!FLAGS_cudnn_deterministic) {
+        CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 handle, cudnn_filter_desc,
                 // dyDesc: Handle to the previously initialized input
@@ -289,7 +289,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
         data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
       }
 
-      PADDLE_ENFORCE(
+      CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
               cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
@@ -297,8 +297,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     if (filter_grad) {
-      if (FLAGS_cudnn_deterministic) {
-        PADDLE_ENFORCE(
+      if (!FLAGS_cudnn_deterministic) {
+        CUDNN_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                 handle, cudnn_input_desc, cudnn_output_grad_desc,
                 cudnn_conv_desc, cudnn_filter_desc,
@@ -308,7 +308,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
         filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
       }
 
-      PADDLE_ENFORCE(
+      CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
               handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
               cudnn_filter_desc, filter_algo, &tmp_size));
@@ -326,7 +326,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset input_grad.
 
       for (int i = 0; i < groups; i++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
             handle, &alpha, cudnn_filter_desc,
             filter_data + i * group_offset_filter, cudnn_output_grad_desc,
             output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
@@ -339,7 +339,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
       for (int i = 0; i < groups; i++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
             handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
             cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
             cudnn_conv_desc, filter_algo, cudnn_workspace,
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 5098bd8700..f07ab5a33b 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -55,7 +55,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(src_pd, user_pd, user_memory_p,
@@ -64,7 +64,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
@@ -80,7 +80,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
@@ -89,7 +89,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
       const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
     auto user_pd = user_weights_memory_p->get_primitive_desc();
     return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
@@ -109,7 +109,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
       const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto src_pd = conv_pd_->src_primitive_desc();
     auto user_pd = user_memory_p->get_primitive_desc();
     return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
@@ -118,7 +118,7 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
       const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
     auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
     auto weights_pd = conv_pd_->weights_primitive_desc();
     return this->AcquireMemory(weights_pd, user_weights_pd,
@@ -197,12 +197,12 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
 
   // Generate keys for storing/retriving primitives for this operator
   // TODO(jczaja): Make hashing function more optimial
-  static std::string GetHash(memory::dims& input_dims,
-                             memory::dims& weights_dims,
-                             std::vector<int>& strides,
-                             std::vector<int>& paddings,
-                             std::vector<int>& dilations, int groups,
-                             const std::string& suffix) {
+  static std::string GetHash(memory::dims& input_dims,     // NOLINT
+                             memory::dims& weights_dims,   // NOLINT
+                             std::vector<int>& strides,    // NOLINT
+                             std::vector<int>& paddings,   // NOLINT
+                             std::vector<int>& dilations,  // NOLINT
+                             int groups, const std::string& suffix) {
     return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
            dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
            suffix;
@@ -280,12 +280,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
      * ('any') which lets a primitive (convolution in this case) choose
      * the memory format preferred for best performance
      */
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    auto chosen_memory_format =
+        platform::data_format_to_memory_format(data_format);
+
     auto src_md = platform::MKLDNNMemDesc(
-        src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto dst_md = platform::MKLDNNMemDesc(
-        dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
 
     // create a conv primitive descriptor and save it for usage in backward
     std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
@@ -423,16 +427,20 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
      * ('any') which lets a primitive (conv backward in this case) choose
      * the memory format preferred for best performance
      */
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    auto chosen_memory_format =
+        platform::data_format_to_memory_format(data_format);
+
     auto src_md = platform::MKLDNNMemDesc(
-        src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto diff_src_md = platform::MKLDNNMemDesc(
-        src_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto diff_weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto diff_dst_md = platform::MKLDNNMemDesc(
-        dst_tz, platform::MKLDNNGetDataType<T>(), memory::format::any);
+        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
 
     // Retrieve conv_pd from device context
     auto conv_pd =
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
index 038ea89990..82fff68e75 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
@@ -87,7 +87,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     // Get the algorithm
-    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
         handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
         // dxDesc: Handle to the previously initialized output tensor
         // descriptor.
@@ -95,7 +95,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
         workspace_size_limit, &algo));
 
     // get workspace size able to allocate
-    PADDLE_ENFORCE(
+    CUDNN_ENFORCE(
         platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
             handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
             cudnn_output_desc, algo, &workspace_size_in_bytes));
@@ -110,7 +110,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
     for (int g = 0; g < groups; g++) {
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
           handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
           cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
           algo, cudnn_workspace, workspace_size_in_bytes, &beta,
@@ -178,11 +178,11 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
       // choose backward algorithm for data
-      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
           handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
           cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
           workspace_size_limit, &data_algo));
-      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
           handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
           cudnn_input_desc, data_algo, &fwd_ws_size));
       workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
@@ -190,7 +190,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
 
     if (filter_grad) {
       // choose backward algorithm for filter
-      PADDLE_ENFORCE(
+      CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
               handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
               cudnn_filter_desc,
@@ -198,7 +198,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
               workspace_size_limit, &filter_algo));
 
       // get workspace for backwards filter algorithm
-      PADDLE_ENFORCE(
+      CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
               handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
               cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
@@ -222,7 +222,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int g = 0; g < groups; g++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
             handle, &alpha, cudnn_output_desc,
             output_grad_data + output_grad_offset * g, cudnn_filter_desc,
             filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
@@ -237,7 +237,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
       for (int g = 0; g < groups; g++) {
-        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
             handle, &alpha, cudnn_output_desc,
             output_grad_data + output_grad_offset * g, cudnn_input_desc,
             input_data + input_offset * g, cudnn_conv_desc, filter_algo,
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 1612927055..da5d20505e 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -17,9 +17,9 @@ if(WITH_GRPC)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   cc_test(grpc_serde_test SRCS grpc_serde_test.cc 
-     DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
-  cc_test(rpc_server_test SRCS rpc_server_test.cc 
-    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_table_op SERIAL)
+    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
+  cc_test(rpc_server_test SRCS rpc_server_test.cc
+    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
   return()
 endif()
 
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 265f964ddc..b4f60c9ff9 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -49,6 +49,7 @@ void GRPCClient::SendComplete() {
 }
 
 GRPCClient::~GRPCClient() {
+  stopped_ = true;
   Wait();
   cq_.Shutdown();
   {
@@ -275,7 +276,7 @@ void GRPCClient::Proceed() {
   void* tag = nullptr;
   bool ok = false;
 
-  while (cq_.Next(&tag, &ok)) {
+  while (!stopped_ && cq_.Next(&tag, &ok)) {
     BaseProcessor* c = static_cast<BaseProcessor*>(tag);
     GPR_ASSERT(ok);
     PADDLE_ENFORCE(c);
diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h
index 8351d825f8..0c95ffeb5c 100644
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -174,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor {
 
 class GRPCClient : public RPCClient {
  public:
-  GRPCClient() : ok_(true), completed_(false) {}
+  GRPCClient() : ok_(true), completed_(false), stopped_(false) {}
   virtual ~GRPCClient();
 
   bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
@@ -237,6 +237,8 @@ class GRPCClient : public RPCClient {
   // mutex for sending complete message only once
   std::mutex completed_mutex_;
   bool completed_;
+
+  volatile bool stopped_;
 };
 
 }  // namespace distributed
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index 55995783c6..de1a503154 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -41,6 +41,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
 
   // Async
   if (!sync_mode_) {
+    rpc_server_->Profiler().OneStep();
     try {
       executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
                                     scope);
diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
index 83b14fa64d..406e7294c1 100644
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -18,11 +18,44 @@
 #include <string>
 
 #include "paddle/fluid/operators/distributed/rpc_server.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_int32(rpc_server_profile_period, 0,
+             "the period of listen_and_serv to do profile");
+DEFINE_string(rpc_server_profile_path, "/dev/null",
+              "the profile log file path");
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
+RPCServerProfiler::RPCServerProfiler(int profile_period,
+                                     const std::string& profile_log_path)
+    : profile_period_(profile_period), profile_log_path_(profile_log_path) {
+  step_ = 0;
+}
+
+void RPCServerProfiler::OneStep() {
+  PADDLE_ENFORCE_LE(step_, profile_period_,
+                    "step_ should not be larger then "
+                    "profile_period_");
+  if (profile_period_ <= 0) {
+    return;
+  }
+
+  if (step_ == 0) {
+    auto pf_state = paddle::platform::ProfilerState::kCPU;
+    paddle::platform::EnableProfiler(pf_state);
+  }
+  if (step_ == profile_period_) {
+    paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal,
+                                      profile_log_path_);
+    step_ = 0;
+  } else {
+    step_++;
+  }
+}
+
 void RPCServer::ShutDown() {
   LOG(INFO) << "RPCServer ShutDown ";
   ShutDownImpl();
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
index fd914d7a72..d813ba03e2 100644
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -19,16 +19,33 @@
 #include <thread>  // NOLINT
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/operators/distributed/request_handler.h"
 
+DECLARE_int32(rpc_server_profile_period);
+DECLARE_string(rpc_server_profile_path);
+
 namespace paddle {
 namespace operators {
 namespace distributed {
 
+class RPCServerProfiler {
+ public:
+  RPCServerProfiler(int profile_period, const std::string& profile_log_path);
+  void OneStep();
+
+ private:
+  const int profile_period_;
+  std::string profile_log_path_;
+  int step_;
+};
+
 class RPCServer {
  public:
   explicit RPCServer(const std::string& address, int client_num)
       : cur_cond_(0),
+        profiler_(FLAGS_rpc_server_profile_period,
+                  FLAGS_rpc_server_profile_path),
         bind_address_(address),
         exit_flag_(false),
         selected_port_(0),
@@ -67,6 +84,7 @@ class RPCServer {
   void Complete();
 
   void ResetBarrierCounter();
+  RPCServerProfiler& Profiler() { return profiler_; }
 
  protected:
   virtual void ShutDownImpl() = 0;
@@ -79,6 +97,7 @@ class RPCServer {
   std::unordered_map<std::string, int> rpc_cond_map_;
   std::atomic<int> cur_cond_;
   std::condition_variable rpc_cond_;
+  RPCServerProfiler profiler_;
 
  protected:
   std::string bind_address_;
diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc
index 9f2360ec70..b50830c362 100644
--- a/paddle/fluid/operators/distributed/rpc_server_test.cc
+++ b/paddle/fluid/operators/distributed/rpc_server_test.cc
@@ -30,7 +30,7 @@ namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace distributed = paddle::operators::distributed;
 
-USE_OP(lookup_table);
+USE_NO_KERNEL_OP(lookup_sparse_table);
 
 std::unique_ptr<distributed::RPCServer> g_rpc_service;
 std::unique_ptr<distributed::RequestHandler> g_req_handler;
@@ -42,13 +42,13 @@ framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
   framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}});
   framework::VariableNameMap output({{"Output", {"out"}}});
   auto op = block->AppendOp();
-  op->SetType("lookup_table");
+  op->SetType("lookup_sparse_table");
   op->SetInput("W", {"w"});
   op->SetInput("Ids", {"ids"});
   op->SetOutput("Out", {"out"});
 
   auto& out = *root_block->Var("out");
-  out.SetType(framework::proto::VarType::SELECTED_ROWS);
+  out.SetType(framework::proto::VarType::LOD_TENSOR);
   out.SetShape({10, 10});
 
   return block;
@@ -59,20 +59,19 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
   w_var->GetMutable<framework::SelectedRows>();
 
   auto out_var = scope->Var("out");
-  out_var->GetMutable<framework::SelectedRows>();
+  out_var->GetMutable<framework::LoDTensor>();
 
   auto ids_var = scope->Var("ids");
-  ids_var->GetMutable<framework::SelectedRows>();
+  ids_var->GetMutable<framework::LoDTensor>();
 }
 
 void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
                          int64_t rows_numel) {
   CreateVarsOnScope(scope, place);
-  auto ids_var = scope->Var("ids")->GetMutable<framework::SelectedRows>();
-  auto rows = ids_var->mutable_rows();
-  for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2);
-  ids_var->mutable_value()->Resize({rows_numel, 1});
-  ids_var->mutable_value()->mutable_data<float>(*place);
+  auto ids_var = scope->Var("ids")->GetMutable<framework::LoDTensor>();
+  int64_t* ids_ptr =
+      ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place);
+  for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2;
 }
 
 void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
@@ -148,11 +147,11 @@ TEST(PREFETCH, CPU) {
     client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name);
     client->Wait();
     auto var = scope.Var(out_var_name);
-    auto value = var->GetMutable<framework::SelectedRows>()->value();
-    auto ptr = value.mutable_data<float>(place);
+    auto value = var->GetMutable<framework::LoDTensor>();
+    auto ptr = value->mutable_data<float>(place);
 
     for (int64_t i = 0; i < rows_numel; ++i) {
-      EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
+      EXPECT_EQ(ptr[0 + i * value->dims()[1]], static_cast<float>(i * 2));
     }
   }
 
diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
index 1a5427b392..c86cd57316 100644
--- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
@@ -47,12 +47,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
     int axis = ctx.Attr<int>("axis");
 
     auto x_dims = x->dims();
-    auto y_dims = y->dims();
+    auto y_dims_untrimed = y->dims();
     auto z_dims = z->dims();
 
     // Execute default elementwise_add operator when
     // broadcast operations need to performed.
-    if (x_dims != y_dims) {
+    if (x_dims != y_dims_untrimed) {
       auto sum_func = [](T a, T b) -> T { return a + b; };
 
       TransformFunctor<decltype(sum_func), T,
@@ -62,11 +62,11 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
               ctx.template device_context<paddle::platform::CPUDeviceContext>(),
               sum_func);
 
-      axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+      axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
       PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                      "Axis should be in range [0, x_dims)");
 
-      trim_trailing_singular_dims(&y_dims);
+      auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
       axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
       int pre, n, post;
@@ -88,7 +88,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
                      "Wrong layout/format set for Y tensor");
 
       std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
-      std::vector<int> src_y_tz = framework::vectorize2int(y_dims);
+      std::vector<int> src_y_tz = framework::vectorize2int(y_dims_untrimed);
       std::vector<int> dst_tz = framework::vectorize2int(z_dims);
 
       std::vector<memory::primitive_desc> srcs_pd;
@@ -142,36 +142,39 @@ class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
+    // skip out, x, y,
+    // dout length is larger or equal than dx, dy.
+    auto* out = dout;
+    auto *x = dout, *y = dout;
 
     auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
       in->set_layout(DataLayout::kMKLDNN);
       in->set_format(out->format());
     };
 
-    if (x->dims() == y->dims()) {
-      auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
-      if (dx) {
-        blas.VCOPY(dout->numel(), dout->data<T>(),
-                   dx->mutable_data<T>(ctx.GetPlace()));
-        set_mkldnn_format(dx, dout);
-      }
-
-      if (dy) {
-        blas.VCOPY(dout->numel(), dout->data<T>(),
-                   dy->mutable_data<T>(ctx.GetPlace()));
-        set_mkldnn_format(dy, dout);
+    if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) {
+      if (dx->dims() == dy->dims()) {
+        auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
+        if (dx) {
+          blas.VCOPY(dout->numel(), dout->data<T>(),
+                     dx->mutable_data<T>(ctx.GetPlace()));
+          set_mkldnn_format(dx, dout);
+        }
+
+        if (dy) {
+          blas.VCOPY(dout->numel(), dout->data<T>(),
+                     dy->mutable_data<T>(ctx.GetPlace()));
+          set_mkldnn_format(dy, dout);
+        }
       }
     } else {
       // Execute default kernel when broadcast is needed
-      ElemwiseGradCompute<paddle::platform::CPUDeviceContext, T,
-                          IdentityGrad<T>, IdentityGrad<T>>(
+      ElemwiseExplicitGradCompute<paddle::platform::CPUDeviceContext, T,
+                                  IdentityGrad<T>, IdentityGrad<T>>(
           ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
           IdentityGrad<T>());
     }
diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc
index d2c2053713..3c97ac995c 100644
--- a/paddle/fluid/operators/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise_add_op.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_add_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_add, "Add", "Out = X + Y");
+REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add);
+REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out",
+                              "X");
 REGISTER_OP_CPU_KERNEL(
     elementwise_add,
     ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu
index dfff518f17..6cbf6066c9 100644
--- a/paddle/fluid/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
@@ -16,6 +16,60 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_add_op.h"
 #include "paddle/fluid/platform/float16.h"
 
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void ElementwiseAddCUDAKernel(const T *x, const T *y, T *z, int n,
+                                         int post, int size) {
+  int idx_x = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx_x < size) {
+    int idx_y = idx_x / post - (idx_x / (n * post)) * n;
+    z[idx_x] = x[idx_x] + y[idx_y];
+  }
+}
+
+template <typename T>
+class ElementwiseAddKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    using Tensor = framework::Tensor;
+
+    const auto x = ctx.Input<Tensor>("X");
+    const auto y = ctx.Input<Tensor>("Y");
+    auto z = ctx.Output<Tensor>("Out");
+    auto *z_data = z->mutable_data<T>(ctx.GetPlace());
+
+    auto &device = *(ctx.cuda_device_context().eigen_device());
+    const framework::DDim &x_dim = x->dims();
+    framework::DDim y_dim = y->dims();
+    int size = x->numel();
+    if (x_dim == y_dim) {
+      auto dim = framework::make_ddim({size});
+      auto z_eigen = framework::EigenTensor<T, 1>::From(*z, dim);
+      auto x_eigen = framework::EigenTensor<T, 1>::From(*x, dim);
+      auto y_eigen = framework::EigenTensor<T, 1>::From(*y, dim);
+      z_eigen.device(device) = x_eigen + y_eigen;
+    } else {
+      int axis = ctx.Attr<int>("axis");
+      axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
+      y_dim = trim_trailing_singular_dims(y_dim);
+      axis = (y_dim.size() == 0) ? x_dim.size() : axis;
+      int pre, n, post;
+      get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
+      int threads = 512;
+      int grids = (size + threads - 1) / threads;
+      auto stream = ctx.cuda_device_context().stream();
+      ElementwiseAddCUDAKernel<T><<<grids, threads, 0, stream>>>(
+          x->data<T>(), y->data<T>(), z_data, n, post, size);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h
index baf04c30b1..0b19723720 100644
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
@@ -95,9 +95,10 @@ void default_elementwise_add_grad(const framework::ExecutionContext& ctx,
                                   framework::Tensor* dy) {
   int axis = ctx.Attr<int>("axis");
 
-  ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>(
-      ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
-      IdentityGrad<T>());
+  ElemwiseExplicitGradCompute<DeviceContext, T, IdentityGrad<T>,
+                              IdentityGrad<T>>(ctx, *x, *y, *out, *dout, axis,
+                                               dx, dy, IdentityGrad<T>(),
+                                               IdentityGrad<T>());
 }
 
 template <typename DeviceContext, typename T>
@@ -140,18 +141,44 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
 
-    if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) {
-      elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
+    if (dx != nullptr) {
+      // In fact, we can just share memory, but it may cause a bug of memory
+      // optimizer
+      // dx->ShareDataWith(*dout);
+      framework::TensorCopy(*dout, ctx.GetPlace(),
+                            ctx.template device_context<DeviceContext>(), dx);
+    }
+
+    if (dy == nullptr) return;
+
+    const framework::DDim& x_dim = dout->dims();
+    framework::DDim y_dim = dy->dims();
+    if (x_dim == y_dim) {
+      // dy->ShareDataWith(*dout);
+      framework::TensorCopy(*dout, ctx.GetPlace(),
+                            ctx.template device_context<DeviceContext>(), dy);
     } else {
-      default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
-                                                     dy);
+      dy->mutable_data<T>(ctx.GetPlace());
+      // Perform reduction to dout to calculate dy
+      int axis = ctx.Attr<int>("axis");
+      axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
+      y_dim = trim_trailing_singular_dims(y_dim);
+      axis = (y_dim.size() == 0) ? x_dim.size() : axis;
+
+      auto& device =
+          *(ctx.template device_context<DeviceContext>().eigen_device());
+      int pre, n, post;
+      get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
+      auto eigen_dout = framework::EigenTensor<T, 3>::From(
+          *dout, framework::make_ddim({pre, n, post}));
+      auto eigen_dy =
+          framework::EigenTensor<T, 1>::From(*dy, framework::make_ddim({n}));
+      eigen_dy.device(device) = eigen_dout.sum(
+          framework::EigenDim<2>::From(framework::make_ddim({0, 2})));
     }
   }
 };
diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc
index 824b1221e5..84c8a65e5f 100644
--- a/paddle/fluid/operators/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise_div_op.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_div_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
+
 REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y");
+
 REGISTER_OP_CPU_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index bb88970e42..d8a12e800a 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -78,7 +78,9 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() final {
     AddInput("X", "(Tensor), The first input tensor of elementwise op.");
     AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
-    AddOutput("Out", "The output of elementwise op.").Reuse("X");
+    // AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save
+    // memory.").AsIntermediate();
+    AddOutput("Out", "The output of elementwise op.");
     AddAttr<int>("axis",
                  "(int, default -1). The start dimension index "
                  "for broadcasting Y onto X.")
@@ -125,11 +127,13 @@ But the output only shares the LoD information with the input $X$.
 
 )DOC",
                                GetName(), GetEquation()));
+    SetReuse();
   }
 
  protected:
   virtual std::string GetName() const = 0;
   virtual std::string GetEquation() const = 0;
+  virtual void SetReuse() {}
 };
 
 class ElementwiseOpGrad : public framework::OperatorWithKernel {
@@ -162,8 +166,8 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    auto input_data_type = framework::ToDataType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
 
 #ifdef PADDLE_WITH_MKLDNN
     if (platform::CanMKLDNNBeUsed(ctx)) {
@@ -175,9 +179,58 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
+
+// For Add, Sub op, the X, Out is not needed.
+class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
+ public:
+  using operators::ElementwiseOpGrad::ElementwiseOpGrad;
+  using operators::ElementwiseOpGrad::GetExpectedKernelType;
+  using Tensor = framework::Tensor;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+      ctx->SetOutputDim(x_grad_name, out_dims);
+    }
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(y_grad_name)) {
+      PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+      auto y_dims = ctx->GetInputDim("Y");
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
+/*
+*/
+
+#define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name)                   \
+  class kernel_type##GradMaker                                               \
+      : public paddle::framework::SingleGradOpDescMaker {                    \
+   public:                                                                   \
+    using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \
+                                                                             \
+   protected:                                                                \
+    std::unique_ptr<paddle::framework::OpDesc> Apply() const override {      \
+      auto* op = new paddle::framework::OpDesc();                            \
+      op->SetType(#kernel_type "_grad");                                     \
+      op->SetInput("Y", Input("Y"));                                         \
+      op->SetInput(::paddle::framework::GradVarName("Out"),                  \
+                   OutputGrad("Out"));                                       \
+      op->SetAttrMap(Attrs());                                               \
+      op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X"));  \
+      op->SetOutput(::paddle::framework::GradVarName("Y"), InputGrad("Y"));  \
+      return std::unique_ptr<::paddle::framework::OpDesc>(op);               \
+    }                                                                        \
+  }
+
 #define REGISTER_ELEMWISE_OP(op_type, op_name, equation)                \
   class __ElemwiseOp##op_type##Maker__                                  \
       : public ::paddle::operators::ElementwiseOpMaker {                \
@@ -190,3 +243,18 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
                     ::paddle::operators::ElementwiseOpInferVarType,     \
                     ::paddle::framework::DefaultGradOpDescMaker<true>); \
   REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)
+
+#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...) \
+  class __ElemwiseOp##op_type##Maker__                                 \
+      : public ::paddle::operators::ElementwiseOpMaker {               \
+   protected:                                                          \
+    virtual std::string GetName() const { return op_name; }            \
+    virtual std::string GetEquation() const { return equation; }       \
+    virtual void SetReuse() { Reuse(__VA_ARGS__); }                    \
+  };                                                                   \
+  REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,       \
+                    __ElemwiseOp##op_type##Maker__,                    \
+                    ::paddle::operators::ElementwiseOpInferVarType,    \
+                    op_type##GradMaker);                               \
+  REGISTER_OPERATOR(op_type##_grad,                                    \
+                    ::paddle::operators::ElementwiseOpExplicitGrad)
diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index 8b052611f8..bc3e95e904 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <glog/logging.h>
 #include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -65,17 +67,21 @@ inline void get_mid_dims(const framework::DDim& x_dims,
   }
 }
 
-inline void trim_trailing_singular_dims(framework::DDim* dims) {
+inline framework::DDim trim_trailing_singular_dims(
+    const framework::DDim& dims) {
   // Remove trailing dimensions of size 1 for y
-  auto actual_dims_size = dims->size();
+  auto actual_dims_size = dims.size();
   for (; actual_dims_size != 0; --actual_dims_size) {
-    if ((*dims)[actual_dims_size - 1] != 1) break;
+    if (dims[actual_dims_size - 1] != 1) break;
   }
-  if (actual_dims_size != dims->size()) {
-    auto actual_dims = framework::vectorize(*dims);
-    actual_dims.resize(actual_dims_size);
-    *dims = framework::make_ddim(actual_dims);
+
+  std::vector<int> trim_dims;
+  trim_dims.resize(actual_dims_size);
+  for (int i = 0; i < actual_dims_size; ++i) {
+    trim_dims[i] = dims[i];
   }
+  framework::DDim actual_dims = framework::make_ddim(trim_dims);
+  return actual_dims;
 }
 
 template <typename T, typename DeviceContext>
@@ -456,6 +462,71 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
 
 #endif
 
+template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
+void ElemwiseGradComputeNoBroadcast(
+    const framework::ExecutionContext& ctx, const framework::DDim& x_dim,
+    const framework::DDim& y_dim, const framework::Tensor& x,
+    const framework::Tensor& y, const framework::Tensor& out,
+    const framework::Tensor& dout, int axis, framework::Tensor* dx,
+    framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) {
+  size_t N = static_cast<size_t>(framework::product(x_dim));
+  platform::ForRange<DeviceContext> for_range(
+      ctx.template device_context<DeviceContext>(), N);
+  for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
+      x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
+      dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+      dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())});
+}
+
+template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
+void ElemwiseGradComputeWithBroadcast(
+    const framework::ExecutionContext& ctx, const framework::DDim& x_dim,
+    const framework::DDim& y_dim_untrimed, const framework::Tensor& x,
+    const framework::Tensor& y, const framework::Tensor& out,
+    const framework::Tensor& dout, int axis, framework::Tensor* dx,
+    framework::Tensor* dy, DX_OP dx_op, DY_OP dy_op) {
+  axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis);
+  auto y_dim = trim_trailing_singular_dims(y_dim_untrimed);
+  axis = (y_dim.size() == 0) ? x_dim.size() : axis;
+
+  int pre, n, post;
+  get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
+  if (post == 1) {
+    int h = pre;
+    int w = n;
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef __NVCC__
+      ElemwiseGradBroadcast1CUDA(
+          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
+          y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op, dy_op,
+          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+#endif
+    } else {
+      ElemwiseGradBroadcast1CPU(
+          x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op,
+          dy_op, dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+    }
+  } else {
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef __NVCC__
+      ElemwiseGradBroadcast2CUDA(
+          ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
+          y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post, dx_op,
+          dy_op, dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+#endif
+    } else {
+      ElemwiseGradBroadcast2CPU(
+          x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post,
+          dx_op, dy_op,
+          dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
+    }
+  }
+}
+
 template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
 void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
                          const framework::Tensor& x, const framework::Tensor& y,
@@ -463,63 +534,50 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
                          const framework::Tensor& dout, int axis,
                          framework::Tensor* dx, framework::Tensor* dy,
                          DX_OP dx_op, DY_OP dy_op) {
+  const framework::DDim& x_dim = x.dims();
+  const framework::DDim& y_dim = y.dims();
   if (x.dims() == y.dims()) {
-    size_t N = static_cast<size_t>(framework::product(x.dims()));
-    platform::ForRange<DeviceContext> for_range(
-        ctx.template device_context<DeviceContext>(), N);
-    for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
-        x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
-        dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-        dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())});
+    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+        ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
   } else {  // Y is a scalar
-    auto x_dim = x.dims();
-    auto y_dim = y.dims();
-
-    axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
-    trim_trailing_singular_dims(&y_dim);
-    axis = (y_dim.size() == 0) ? x_dim.size() : axis;
-
-    int pre, n, post;
-    get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
-    if (post == 1) {
-      int h = pre;
-      int w = n;
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-        ElemwiseGradBroadcast1CUDA(
-            ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-            y.data<T>(), out.data<T>(), dout.data<T>(), h, w, dx_op, dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-#endif
-      } else {
-        ElemwiseGradBroadcast1CPU(
-            x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), h, w,
-            dx_op, dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-      }
-    } else {
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef __NVCC__
-        ElemwiseGradBroadcast2CUDA(
-            ctx.template device_context<DeviceContext>().stream(), x.data<T>(),
-            y.data<T>(), out.data<T>(), dout.data<T>(), pre, n, post, dx_op,
-            dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-#endif
-      } else {
-        ElemwiseGradBroadcast2CPU(
-            x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), pre, n,
-            post, dx_op, dy_op,
-            dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
-            dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()));
-      }
+    ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+        ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+  }
+}
+
+// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub.
+// explicit gradient can cut off X, Y, Out from gradient op
+// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse
+// elementwise code.
+template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP>
+void ElemwiseExplicitGradCompute(const framework::ExecutionContext& ctx,
+                                 const framework::Tensor& x,
+                                 const framework::Tensor& y,
+                                 const framework::Tensor& out,
+                                 const framework::Tensor& dout, int axis,
+                                 framework::Tensor* dx, framework::Tensor* dy,
+                                 DX_OP dx_op, DY_OP dy_op) {
+  if (dy == nullptr) {
+    const framework::DDim& dx_dims = dout.dims();
+    auto dy_dims = dx_dims;
+    ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+        ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+  } else {
+    if (dout.dims() == dy->dims()) {
+      const framework::DDim& dx_dims = dout.dims();
+      const framework::DDim& dy_dims = dy->dims();
+      ElemwiseGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+          ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
+    } else {  // Y is a scalar
+      auto dx_dims = dout.dims();
+      const framework::DDim& dy_dims = dy->dims();
+      ElemwiseGradComputeWithBroadcast<DeviceContext, T, DX_OP, DY_OP>(
+          ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op);
     }
   }
 }
 
+// Deprecated
 template <typename DeviceContext, typename T, typename functor,
           typename broadcastfunctor, typename broadcast2functor>
 void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
@@ -547,7 +605,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
   }
 
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-  trim_trailing_singular_dims(&y_dims);
+  trim_trailing_singular_dims(y_dims);
   axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
   int pre, n, post;
@@ -574,19 +632,19 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
       x, y, z, ctx.template device_context<DeviceContext>(), func);
 
   auto x_dims = x->dims();
-  auto y_dims = y->dims();
-  PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+  auto y_dims_untrimed = y->dims();
+  PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
                     "Rank of first input must >= rank of second input.");
 
-  if (x_dims == y_dims) {
+  if (x_dims == y_dims_untrimed) {
     functor.Run();
     return;
   }
 
-  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
   PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                  "Axis should be in range [0, x_dims)");
-  trim_trailing_singular_dims(&y_dims);
+  auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
   axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
   int pre, n, post;
diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc
index a7562b166b..b7224261e6 100644
--- a/paddle/fluid/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
@@ -15,7 +15,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_sub_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_sub, "Sub", "Out = X - Y");
+REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub);
+REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out",
+                              "X");
+
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h
index fe088b8203..11c7e3fe62 100644
--- a/paddle/fluid/operators/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise_sub_op.h
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -55,14 +55,15 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
+    // skip out, x, y
+    auto* out = dout;
+    auto *x = dout, *y = dout;
+
+    ElemwiseExplicitGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
         ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
   }
 };
diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc
new file mode 100644
index 0000000000..9a297d03cf
--- /dev/null
+++ b/paddle/fluid/operators/extract_rows_op.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class ExtractRowsOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ExtractRowsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ExtractRowsOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X")[0],
+                      framework::proto::VarType::SELECTED_ROWS,
+                      "The type of input(X) must be SelectedRows.");
+    auto in_dims = ctx->GetInputDim("X");
+
+    ctx->SetOutputDim(
+        "Out", framework::make_ddim(std::vector<int64_t>{in_dims[0], 1}));
+  }
+};
+
+class ExtractRowsOp : public framework::OperatorBase {
+ public:
+  ExtractRowsOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
+    auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+
+    auto in_rows = in.rows();
+    auto out_dim = framework::make_ddim(
+        std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
+    auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
+
+    if (paddle::platform::is_gpu_place(in.place())) {
+#ifdef PADDLE_WITH_CUDA
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx = pool.Get(in.place());
+      auto src_ptr = in_rows.Data(in.place());
+      auto stream =
+          reinterpret_cast<const platform::CUDADeviceContext &>(*dev_ctx)
+              .stream();
+      memory::Copy(boost::get<platform::CUDAPlace>(out->place()), dst_ptr,
+                   boost::get<platform::CUDAPlace>(in.place()), src_ptr,
+                   in_rows.size() * sizeof(int64_t), stream);
+#else
+      PADDLE_THROW("Not compiled with CUDA.");
+#endif
+    } else {
+      memory::Copy(platform::CPUPlace(), dst_ptr, platform::CPUPlace(),
+                   in_rows.data(), in_rows.size() * sizeof(int64_t));
+    }
+  }
+};
+
+class ExtractRowsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(SelectedRows). The input tensor of extract_rows operator,"
+             " and its type is SelectedRows.");
+    AddOutput("Out", "(Tensor). The the rows of input(X).");
+
+    AddComment(R"DOC(
+    ExtractRows Operator.
+
+The function of extract_rows_op is extracting the rows from the input(X)
+whose type is SelectedRows.
+
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(extract_rows, ops::ExtractRowsOp, ops::ExtractRowsOpMaker,
+                  ops::ExtractRowsOpInferShape);
diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc
index bcb3e63ed7..dc7ef66495 100644
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
@@ -31,7 +31,6 @@ class FeedOp : public framework::OperatorBase {
                const platform::Place &place) const override {
     // get device context from pool
     auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-    platform::RecordEvent record_event(Type(), dev_ctx);
 
     auto feed_var_name = Input("X");
     auto *feed_var = scope.FindVar(feed_var_name);
diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc
index 680fde19ee..d9cd956dfd 100644
--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@@ -36,12 +36,6 @@ class FetchBarrierOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
     std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
-
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(place);
-    // For profiling
-    platform::RecordEvent record_event(Type(), &ctx);
-
     distributed::RPCClient* rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc
index 1640a2a22c..c197b45e81 100644
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -30,9 +30,6 @@ class FetchOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    platform::RecordEvent record_event(Type(), pool.Get(place));
-
     auto fetch_var_name = Input("X");
     auto *fetch_var = scope.FindVar(fetch_var_name);
     PADDLE_ENFORCE(fetch_var != nullptr,
diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc
new file mode 100644
index 0000000000..fdda01381e
--- /dev/null
+++ b/paddle/fluid/operators/flatten_op.cc
@@ -0,0 +1,169 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class FlattenOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input (X) of Flatten op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output (Output) of Flatten op should not be null.");
+    const auto &axis = ctx->Attrs().Get<int>("axis");
+    const auto &in_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(axis >= 0, "The axis should be greater than or equal to 0.");
+    PADDLE_ENFORCE(
+        axis <= in_dims.size(),
+        "The axis should be less than or equal to input tensor's rank.");
+
+    const auto &out_dims = GetOutputShape(axis, in_dims);
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    if (in_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+
+  static std::vector<int32_t> GetOutputShape(const int axis,
+                                             const framework::DDim &in_dims) {
+    int64_t outer = 1, inner = 1;
+    for (int i = 0; i < in_dims.size(); ++i) {
+      if (i < axis) {
+        outer *= in_dims[i];
+      } else {
+        inner *= in_dims[i];
+      }
+    }
+    std::vector<int32_t> out_shape(2);
+    out_shape[0] = outer;
+    out_shape[1] = inner;
+    return out_shape;
+  }
+};
+
+class FlattenOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axis = Attr<int>("axis");
+    auto in_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims);
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = out_dims;
+    attrs["inplace"] = false;
+    // Invoke Reshape Op
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+class FlattenOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) A tensor of rank >= axis.");
+    AddOutput("Out",
+              "A 2D tensor is reshaped input tensor. The input dimensions"
+              "up to axis are flattened to the outer dimension of the output"
+              "and the remaining input dimensions are flattened into the inner"
+              "dimension of the output.");
+    AddAttr<int>("axis",
+                 "(int)"
+                 "Indicate up to which input dimensions (exclusive) should be"
+                 "flattened to the outer dimension of the output. The value"
+                 "for axis must be in the range [0, R], where R is the rank of"
+                 "the input tensor. When axis = 0, the shape of the output"
+                 "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the"
+                 "input tensor is (d_0, d_1, ... d_n).")
+        .SetDefault(1);
+    AddComment(R"DOC(
+Flatten Operator
+
+Flattens the input tensor into a 2D matrix.
+
+Examples:
+Case 1:
+  Given
+    X.shape = (3, 100, 100, 4)
+  and
+    axis = 2
+  We get:
+    Out.shape = (3 * 100, 4 * 100)
+
+Case 2:
+  Given
+    X.shape = (3, 100, 100, 4)
+  and
+    axis = 0
+  We get:
+    Out.shape = (1, 3 * 100 * 100 * 4)
+)DOC");
+  }
+};
+
+class FlattenGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    context->SetOutputDim(framework::GradVarName("X"),
+                          context->GetInputDim("X"));
+    context->ShareLoD("X", framework::GradVarName("X"));
+  }
+};
+
+class FlattenGradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto in_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(in_dims);
+    attrs["inplace"] = false;
+
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
+        attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+USE_OP(reshape);
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
+                  ops::FlattenOpInferShape,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused_elemwise_activation_op.cc
new file mode 100644
index 0000000000..a6fd0aeb02
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("X"),
+        "Input(X) of FusedElemwiseActivationOp op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Y"),
+        "Input(Y) of FusedElemwiseActivationOp op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FusedElemwiseActivationOp op should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto y_dim = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                      "Rank of first input must >= rank of second input.");
+
+    ctx->SetOutputDim("Out", x_dim);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx.Input<framework::Tensor>("X")->type(),
+                      ctx.Input<framework::Tensor>("Y")->type(),
+                      "The element's type of input should be the same.");
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+};
+
+class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(vector<Tensor>)");
+    AddInput("Y", "(vector<Tensor>)");
+    AddOutput("Out", "vector<Tensor>");
+    AddAttr<int>("axis",
+                 "axis is used by elementwise_op, the default value is -1.")
+        .SetDefault(-1);
+    AddAttr<float>("scale",
+                   "scale is used by scale_op, the default value is 0.0.")
+        .SetDefault(0.0);
+    AddAttr<bool>("recomputation",
+                  "Whether to recompute the Out."
+                  "fused_elemwise_activation_grad has two methods to get the "
+                  "dx and dy, one "
+                  "is to use the 'Out', and the other is not to use it. "
+                  "The former method will save the time of recomputing the "
+                  "'Out', but it must occupy the memory to store the 'out'. "
+                  "While, the later method can avoid occupying the memory, "
+                  "but it must recompute the 'Out'. The default value is true.")
+        .SetDefault(true);
+    AddAttr<std::vector<std::string>>("functor_list",
+                                      "The functors that should be fused.")
+        .AddCustomChecker([&](const std::vector<std::string> &functor_list) {
+          PADDLE_ENFORCE(ValidCheck(functor_list));
+        });
+
+    AddComment(R"DOC(
+FusedElemwiseActivation Operator.
+
+At present, FusedElemwiseActivation only supports Two kinds of compound
+operators (elementwise_op and activation_op):
+
+    Z = Binary(X, Unary(Y))
+    Z = Unary(Binary(X, Y))
+
+The attributions of activation_op can be get from fused_elemwise_activation_op's
+attributions. functor_list records the functors to be fused, for example
+"scale,elementwise_add".
+
+)DOC");
+  }
+
+ private:
+  bool ValidCheck(const std::vector<std::string> &functors) {
+    std::unordered_set<std::string> unary_fun = {"scale", "relu"};
+    std::unordered_set<std::string> binary_fun = {"elementwise_add"};
+
+    std::string unary_fun_str;
+    if (binary_fun.count(functors[0])) {
+      unary_fun_str = functors[1];
+    } else if (binary_fun.count(functors[1])) {
+      unary_fun_str = functors[0];
+    } else {
+      PADDLE_THROW("%s and %s are not included in fused_list.", functors[0],
+                   functors[1]);
+    }
+    PADDLE_ENFORCE_EQ(unary_fun.count(unary_fun_str), 1,
+                      "%s is not included in fused_list.", unary_fun_str);
+    return true;
+  }
+};
+
+class FusedElemwiseActivationGradMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *op_desc_ptr = new framework::OpDesc();
+    op_desc_ptr->SetType(this->ForwardOpType() + "_grad");
+
+    for (auto &input_param : this->InputNames()) {
+      op_desc_ptr->SetInput(input_param, this->Input(input_param));
+      op_desc_ptr->SetOutput(framework::GradVarName(input_param),
+                             this->InputGrad(input_param, true));
+    }
+
+    for (auto &output_param : this->OutputNames()) {
+      op_desc_ptr->SetInput(output_param, this->Output(output_param));
+      op_desc_ptr->SetInput(framework::GradVarName(output_param),
+                            this->OutputGrad(output_param));
+    }
+    op_desc_ptr->SetAttrMap(this->Attrs());
+
+    std::vector<std::string> functor_names =
+        boost::get<std::vector<std::string>>(
+            op_desc_ptr->GetAttr("functor_list"));
+    functor_names[0] += "_grad";
+    functor_names[1] += "_grad";
+    op_desc_ptr->SetAttr("functor_list", functor_names);
+    return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
+  }
+};
+
+class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                      "Rank of first input must >= rank of second input.");
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type_index = ctx.Input<framework::Tensor>("X")->type();
+    PADDLE_ENFORCE_EQ(input_data_type_index,
+                      ctx.Input<framework::Tensor>("Y")->type(),
+                      "The element's type of input should be the same.");
+    PADDLE_ENFORCE_EQ(
+        input_data_type_index,
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
+        "The element's type of input should be the same.");
+
+    auto input_data_type = framework::ToDataType(input_data_type_index);
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fused_elemwise_activation, ops::FusedElemwiseActivationOp,
+                  ops::FusedElemwiseActivationMaker,
+                  ops::FusedElemwiseActivationGradMaker);
+REGISTER_OPERATOR(fused_elemwise_activation_grad,
+                  ops::FusedElemwiseActivationOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    fused_elemwise_activation,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
+                                       float>,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CPUDeviceContext,
+                                       double>);
+
+REGISTER_OP_CPU_KERNEL(
+    fused_elemwise_activation_grad,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                           float>,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CPUDeviceContext,
+                                           double>);
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused_elemwise_activation_op.cu
new file mode 100644
index 0000000000..e1d2b16b4b
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cu
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused_elemwise_activation_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    fused_elemwise_activation,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
+                                       float>,
+    ops::FusedElemwiseActivationKernel<paddle::platform::CUDADeviceContext,
+                                       double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    fused_elemwise_activation_grad,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                           float>,
+    ops::FusedElemwiseActivationGradKernel<paddle::platform::CUDADeviceContext,
+                                           double>);
diff --git a/paddle/fluid/operators/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused_elemwise_activation_op.h
new file mode 100644
index 0000000000..fe0017b824
--- /dev/null
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.h
@@ -0,0 +1,425 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/functors.h"
+
+namespace math = paddle::operators::math;
+
+namespace paddle {
+namespace operators {
+
+// CompoundFunctors
+// For example: Z = Binary(X, Unary(Y))
+template <typename T, typename BinaryFun, typename UnaryFun>
+struct BinaryCompoundFunctor {
+  BinaryCompoundFunctor(const BinaryFun &binary_fun, const UnaryFun &unary_fun)
+      : binary_fun_(binary_fun), unary_fun_(unary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y) {
+    return binary_fun_(x, unary_fun_(y));
+  }
+
+ private:
+  BinaryFun binary_fun_;
+  UnaryFun unary_fun_;
+};
+
+// For example: Z = Unary(Binary(X, Y))
+template <typename T, typename UnaryFun, typename BinaryFun>
+struct UnaryCompoundFunctor {
+  UnaryCompoundFunctor(const UnaryFun &unary_fun, const BinaryFun &binary_fun)
+      : unary_fun_(unary_fun), binary_fun_(binary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y) {
+    return unary_fun_(binary_fun_(x, y));
+  }
+
+ private:
+  UnaryFun unary_fun_;
+  BinaryFun binary_fun_;
+};
+
+// FIXME(zcd): DBinaryFun and DUnaryFun have to method to get
+// the dx, one is to use the 'out', and the other is not to use it.
+// the former method will save the time of recomputing the
+// 'out', but it must occupy the memory to store the 'out'.
+// While the later method can avoid occupying this memory,
+// but it must recompute the 'out'.
+
+template <typename T, typename DBinaryFun, typename UnaryFun,
+          bool Recomputation = true>
+struct BinaryCompoundGradDxFunctor {
+  BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun,
+                              const UnaryFun &unary_fun)
+      : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    if (Recomputation) {
+      return dout * d_binary_fun_(x, unary_fun_(y));
+    } else {
+      return dout * d_binary_fun_(x, unary_fun_(y), out);
+    }
+  }
+
+ private:
+  DBinaryFun d_binary_fun_;
+  UnaryFun unary_fun_;
+};
+
+template <typename T, typename DBinaryFun, typename UnaryFun,
+          typename DUnaryFun, bool Recomputation = true>
+struct BinaryCompoundGradDyFunctor {
+  BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun,
+                              const UnaryFun &unary_fun,
+                              const DUnaryFun &d_unary_fun)
+      : d_binary_fun_(d_binary_fun),
+        unary_fun_(unary_fun),
+        d_unary_fun_(d_unary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    if (Recomputation) {
+      return dout * d_binary_fun_(unary_fun_(y), x) * d_unary_fun_(y);
+    } else {
+      return dout * d_binary_fun_(unary_fun_(y), x, out) * d_unary_fun_(y);
+    }
+  }
+
+ private:
+  DBinaryFun d_binary_fun_;
+  UnaryFun unary_fun_;
+  DUnaryFun d_unary_fun_;
+};
+
+template <typename T, typename DUnaryFun, typename BinaryFun,
+          typename DBinaryFun, bool Recomputation = true>
+struct UnaryCompoundGradDxFunctor {
+  UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun,
+                             const BinaryFun &binary_fun,
+                             const DBinaryFun &d_binary_fun)
+      : d_unary_fun_(d_unary_fun),
+        binary_fun_(binary_fun),
+        d_binary_fun_(d_binary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    T base;
+    if (Recomputation) {
+      base = dout * d_unary_fun_(binary_fun_(x, y));
+    } else {
+      base = dout * d_unary_fun_(binary_fun_(x, y), out);
+    }
+    return base * d_binary_fun_(x, y);
+  }
+
+ private:
+  DUnaryFun d_unary_fun_;
+  BinaryFun binary_fun_;
+  DBinaryFun d_binary_fun_;
+};
+
+template <typename T, typename DUnaryFun, typename BinaryFun,
+          typename DBinaryFun, bool Recomputation = true>
+struct UnaryCompoundGradDyFunctor {
+  UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun,
+                             const BinaryFun &binary_fun,
+                             const DBinaryFun &d_binary_fun)
+      : d_unary_fun_(d_unary_fun),
+        binary_fun_(binary_fun),
+        d_binary_fun_(d_binary_fun) {}
+
+  inline HOSTDEVICE T operator()(T x, T y, T out, T dout) {
+    T base;
+    if (Recomputation) {
+      base = dout * d_unary_fun_(binary_fun_(x, y));
+    } else {
+      base = dout * d_unary_fun_(binary_fun_(x, y), out);
+    }
+    return base * d_binary_fun_(y, x);
+  }
+
+ private:
+  DUnaryFun d_unary_fun_;
+  BinaryFun binary_fun_;
+  DBinaryFun d_binary_fun_;
+};
+
+template <typename DeviceContext, typename T, typename BinaryFunctor,
+          typename UnaryFunctor>
+static void RunBinaryCompoundFunctor(const framework::ExecutionContext &ctx,
+                                     const BinaryFunctor &binary_functor,
+                                     const UnaryFunctor &unary_functor,
+                                     const framework::Tensor *in_x,
+                                     const framework::Tensor *in_y,
+                                     framework::Tensor *output) {
+  int axis = ctx.Attr<int>("axis");
+  using BinaryCompoundFunctor =
+      BinaryCompoundFunctor<T, BinaryFunctor, UnaryFunctor>;
+
+  ElementwiseComputeEx<BinaryCompoundFunctor, DeviceContext, T>(
+      ctx, in_x, in_y, axis,
+      BinaryCompoundFunctor(binary_functor, unary_functor), output);
+}
+
+template <typename DeviceContext, typename T, typename UnaryFunctor,
+          typename BinaryFunctor>
+static void RunUnaryCompoundFunctors(const framework::ExecutionContext &ctx,
+                                     const UnaryFunctor &unary_functor,
+                                     const BinaryFunctor &binary_functor,
+                                     const framework::Tensor *in_x,
+                                     const framework::Tensor *in_y,
+                                     framework::Tensor *output) {
+  int axis = ctx.Attr<int>("axis");
+
+  using UnaryCompoundFunctor =
+      UnaryCompoundFunctor<T, UnaryFunctor, BinaryFunctor>;
+
+  ElementwiseComputeEx<UnaryCompoundFunctor, DeviceContext, T>(
+      ctx, in_x, in_y, axis,
+      UnaryCompoundFunctor(unary_functor, binary_functor), output);
+}
+
+template <typename DeviceContext, typename T, typename BinaryGradFunctor,
+          typename UnaryFunctor, typename UnaryGradFunctor,
+          bool Recomputation = true>
+static void RunBinaryCompoundGradFunctors(
+    const framework::ExecutionContext &ctx,
+    const BinaryGradFunctor &binary_grad_functor,
+    const UnaryFunctor &unary_functor,
+    const UnaryGradFunctor &unary_grad_functor, const framework::Tensor *in_x,
+    const framework::Tensor *in_y, const framework::Tensor *in_out,
+    const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
+    framework::Tensor *y_grad) {
+  int axis = ctx.Attr<int>("axis");
+
+  using BinaryCompoundDxFunctor =
+      BinaryCompoundGradDxFunctor<T, BinaryGradFunctor, UnaryFunctor,
+                                  Recomputation>;
+  using BinaryCompoundDyFunctor =
+      BinaryCompoundGradDyFunctor<T, BinaryGradFunctor, UnaryFunctor,
+                                  UnaryGradFunctor, Recomputation>;
+
+  ElemwiseGradCompute<DeviceContext, T, BinaryCompoundDxFunctor,
+                      BinaryCompoundDyFunctor>(
+      ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad,
+      BinaryCompoundDxFunctor(binary_grad_functor, unary_functor),
+      BinaryCompoundDyFunctor(binary_grad_functor, unary_functor,
+                              unary_grad_functor));
+}
+
+template <typename DeviceContext, typename T, typename UnaryGradFunctor,
+          typename BinaryFunctor, typename BinaryGradFunctor,
+          bool Recomputation = true>
+static void RunUnaryCompoundGradFunctors(
+    const framework::ExecutionContext &ctx,
+    const UnaryGradFunctor &unary_grad_functor,
+    const BinaryFunctor &binary_functor,
+    const BinaryGradFunctor &binary_grad_functor, const framework::Tensor *in_x,
+    const framework::Tensor *in_y, const framework::Tensor *in_out,
+    const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
+    framework::Tensor *y_grad) {
+  int axis = ctx.Attr<int>("axis");
+
+  using UnaryCompoundDxFunctor =
+      UnaryCompoundGradDxFunctor<T, UnaryGradFunctor, BinaryFunctor,
+                                 BinaryGradFunctor, Recomputation>;
+  using UnaryCompoundDyFunctor =
+      UnaryCompoundGradDyFunctor<T, UnaryGradFunctor, BinaryFunctor,
+                                 BinaryGradFunctor, Recomputation>;
+
+  ElemwiseGradCompute<DeviceContext, T, UnaryCompoundDxFunctor,
+                      UnaryCompoundDyFunctor>(
+      ctx, *in_x, *in_y, *in_out, *in_out_grad, axis, x_grad, y_grad,
+      UnaryCompoundDxFunctor(unary_grad_functor, binary_functor,
+                             binary_grad_functor),
+      UnaryCompoundDyFunctor(unary_grad_functor, binary_functor,
+                             binary_grad_functor));
+}
+
+template <typename DeviceContext, typename T>
+static void RunFunctors(const framework::ExecutionContext &ctx,
+                        const framework::Tensor *in_x,
+                        const framework::Tensor *in_y,
+                        framework::Tensor *output) {
+  auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
+  auto funcs_str = functors[0] + "," + functors[1];
+  // TODO(zcd): The following code can be refined.
+  if (funcs_str == "elementwise_add,scale") {
+    // Z = Binary(X, Unary(Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    RunBinaryCompoundFunctor<DeviceContext, T, math::AddFunctor<T>,
+                             math::ScaleFunctor<T>>(
+        ctx, math::AddFunctor<T>(), math::ScaleFunctor<T>(scale), in_x, in_y,
+        output);
+  } else if (funcs_str == "scale,elementwise_add") {
+    // Z = Unary(Binary(X, Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    RunUnaryCompoundFunctors<DeviceContext, T, math::ScaleFunctor<T>,
+                             math::AddFunctor<T>>(
+        ctx, math::ScaleFunctor<T>(scale), math::AddFunctor<T>(), in_x, in_y,
+        output);
+  } else if (funcs_str == "elementwise_add,relu") {
+    RunBinaryCompoundFunctor<DeviceContext, T, math::AddFunctor<T>,
+                             math::ReluFunctor<T>>(
+        ctx, math::AddFunctor<T>(), math::ReluFunctor<T>(), in_x, in_y, output);
+  } else if (funcs_str == "relu,elementwise_add") {
+    RunUnaryCompoundFunctors<DeviceContext, T, math::ReluFunctor<T>,
+                             math::AddFunctor<T>>(
+        ctx, math::ReluFunctor<T>(), math::AddFunctor<T>(), in_x, in_y, output);
+  } else {
+    PADDLE_THROW("%s has not been implemented.", funcs_str);
+  }
+}
+
+template <typename DeviceContext, typename T>
+static void RunGradFunctors(const framework::ExecutionContext &ctx,
+                            const framework::Tensor *in_x,
+                            const framework::Tensor *in_y,
+                            const framework::Tensor *in_out,
+                            const framework::Tensor *in_out_grad,
+                            framework::Tensor *x_grad,
+                            framework::Tensor *y_grad) {
+  auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
+  auto funcs_str = functors[0] + "," + functors[1];
+
+  bool recomputation = ctx.Attr<bool>("recomputation");
+
+  // TODO(zcd): The following code can be refined. for example, use registion
+  if (funcs_str == "elementwise_add_grad,scale_grad") {
+    // The backward of Z = Binary(X, Unary(Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    if (recomputation) {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ScaleFunctor<T>,
+                                    math::ScaleGradFunctor<T>, true>(
+          ctx, math::AddGradFunctor<T>(), math::ScaleFunctor<T>(scale),
+          math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out, in_out_grad,
+          x_grad, y_grad);
+    } else {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ScaleFunctor<T>,
+                                    math::ScaleGradFunctor<T>, false>(
+          ctx, math::AddGradFunctor<T>(), math::ScaleFunctor<T>(scale),
+          math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out, in_out_grad,
+          x_grad, y_grad);
+    }
+  } else if (funcs_str == "scale_grad,elementwise_add_grad") {
+    // The backward of Z = Unary(Binary(X, Y))
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    if (recomputation) {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ScaleGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   true>(ctx, math::ScaleGradFunctor<T>(scale),
+                                         math::AddFunctor<T>(),
+                                         math::AddGradFunctor<T>(), in_x, in_y,
+                                         in_out, in_out_grad, x_grad, y_grad);
+    } else {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ScaleGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   false>(ctx, math::ScaleGradFunctor<T>(scale),
+                                          math::AddFunctor<T>(),
+                                          math::AddGradFunctor<T>(), in_x, in_y,
+                                          in_out, in_out_grad, x_grad, y_grad);
+    }
+  } else if (funcs_str == "elementwise_add_grad,relu_grad") {
+    if (recomputation) {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ReluFunctor<T>,
+                                    math::ReluGradFunctor<T>, true>(
+          ctx, math::AddGradFunctor<T>(), math::ReluFunctor<T>(),
+          math::ReluGradFunctor<T>(), in_x, in_y, in_out, in_out_grad, x_grad,
+          y_grad);
+    } else {
+      RunBinaryCompoundGradFunctors<DeviceContext, T, math::AddGradFunctor<T>,
+                                    math::ReluFunctor<T>,
+                                    math::ReluGradFunctor<T>, false>(
+          ctx, math::AddGradFunctor<T>(), math::ReluFunctor<T>(),
+          math::ReluGradFunctor<T>(), in_x, in_y, in_out, in_out_grad, x_grad,
+          y_grad);
+    }
+  } else if (funcs_str == "relu_grad,elementwise_add_grad") {
+    if (recomputation) {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ReluGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   true>(ctx, math::ReluGradFunctor<T>(),
+                                         math::AddFunctor<T>(),
+                                         math::AddGradFunctor<T>(), in_x, in_y,
+                                         in_out, in_out_grad, x_grad, y_grad);
+    } else {
+      RunUnaryCompoundGradFunctors<DeviceContext, T, math::ReluGradFunctor<T>,
+                                   math::AddFunctor<T>, math::AddGradFunctor<T>,
+                                   false>(ctx, math::ReluGradFunctor<T>(),
+                                          math::AddFunctor<T>(),
+                                          math::AddGradFunctor<T>(), in_x, in_y,
+                                          in_out, in_out_grad, x_grad, y_grad);
+    }
+  } else {
+    PADDLE_THROW("%s has not been implemented.", funcs_str);
+  }
+}
+
+template <typename DeviceContext, typename T>
+class FusedElemwiseActivationKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto &in_x = detail::Ref(ctx.Input<framework::Tensor>("X"),
+                             "Cannot get input tensor %s, variable name = %s",
+                             "X", ctx.op().Input("X"));
+    auto &in_y = detail::Ref(ctx.Input<framework::Tensor>("Y"),
+                             "Cannot get input tensor %s, variable name = %s",
+                             "Y", ctx.op().Input("Y"));
+    auto &output = detail::Ref(ctx.Output<framework::Tensor>("Out"),
+                               "Cannot get input tensor %s, variable name = %s",
+                               "Out", ctx.op().Output("Out"));
+
+    RunFunctors<DeviceContext, T>(ctx, &in_x, &in_y, &output);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto &in_x = detail::Ref(ctx.Input<framework::Tensor>("X"),
+                             "Cannot get input tensor %s, variable name = %s",
+                             "X", ctx.op().Input("X"));
+    auto &in_y = detail::Ref(ctx.Input<framework::Tensor>("Y"),
+                             "Cannot get input tensor %s, variable name = %s",
+                             "Y", ctx.op().Input("Y"));
+    auto &in_out = detail::Ref(ctx.Input<framework::Tensor>("Out"),
+                               "Cannot get input tensor %s, variable name = %s",
+                               "Out", ctx.op().Input("Out"));
+    auto &in_out_grad =
+        detail::Ref(ctx.Input<framework::Tensor>(framework::GradVarName("Out")),
+                    "Cannot get input tensor %s, variable name = %s",
+                    framework::GradVarName("Out"),
+                    ctx.op().Input(framework::GradVarName("Out")));
+
+    framework::Tensor *x_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    framework::Tensor *y_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+
+    RunGradFunctors<DeviceContext, T>(ctx, &in_x, &in_y, &in_out, &in_out_grad,
+                                      x_grad, y_grad);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 438b44b42a..b194807696 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -19,11 +19,12 @@ limitations under the License. */
 #include <thread>  // NOLINT
 #include <vector>
 
+#include "gflags/gflags.h"
+
 #include "paddle/fluid/operators/detail/macros.h"
 
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/listen_and_serv_op.h"
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -103,6 +104,7 @@ void ListenAndServOp::RunSyncLoop(
     framework::Scope *recv_scope,
     const std::vector<int> &prefetch_block_id_list,
     const int checkpoint_point_block_id) const {
+  VLOG(2) << "RunSyncLoop";
   size_t num_blocks = program->Size();
   auto optimize_blocks =
       Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
@@ -122,7 +124,9 @@ void ListenAndServOp::RunSyncLoop(
       std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
 
   rpc_service_->ResetBarrierCounter();
+
   while (true) {
+    rpc_service_->Profiler().OneStep();
     // Get from multiple trainers, we don't care about the order in which
     // the gradients arrives, just add suffix 0~n and merge the gradient.
     rpc_service_->SetCond(distributed::kRequestSend);
@@ -170,6 +174,7 @@ void ListenAndServOp::RunSyncLoop(
 void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
                                    framework::ProgramDesc *program,
                                    framework::Scope *recv_scope) const {
+  VLOG(2) << "RunAsyncLoop";
   // grad name to block id
   std::unordered_map<std::string, int32_t> grad_to_block_id;
   std::unordered_map<int32_t, std::string> id_to_grad;
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index ac35cf0b89..27e26cb1b5 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -31,9 +31,6 @@ class LoadOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-    platform::RecordEvent record_event(Type(), dev_ctx);
-
     // FIXME(yuyang18): We save variable to local file now, but we should change
     // it to save an output stream.
     auto filename = Attr<std::string>("file_path");
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index bda4994322..d77b095c5d 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -32,20 +32,21 @@ class LookupTableOp : public framework::OperatorWithKernel {
 
     auto table_dims = ctx->GetInputDim("W");
     auto ids_dims = ctx->GetInputDim("Ids");
+    int ids_rank = ids_dims.size();
 
-    auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
-    // is LoDTensor, this tensor contains the ids to be looked up in W
-    // and it must be a column vector with rank = 2 while the 2nd dimension
-    // size must be 1, when Ids's type is SelectedRows, the rows of Ids
-    // contains the ids to be looked up in W;
-    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
-      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
-    }
+    PADDLE_ENFORCE_EQ(table_dims.size(), 2);
+    PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
+                      "The last dimension of the 'Ids' tensor must be 1.");
+
+    auto output_dims =
+        framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
+    output_dims.push_back(table_dims[1]);
+    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
 
-    ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
-    ctx->ShareLoD("Ids", /*->*/ "Out");
+    if (ctx->GetOutputsVarType("Out")[0] ==
+        framework::proto::VarType::LOD_TENSOR) {
+      ctx->ShareLoD("Ids", /*->*/ "Out");
+    }
   }
 
  protected:
@@ -62,17 +63,11 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("W",
              "(Tensor) The input represents embedding tensors, "
              "which is a learnable parameter.");
-    AddInput(
-        "Ids",
-        "(Tensor or SelectedRows) Ids's type can be Tensor or "
-        "SelectedRows, when Ids's type is Tensor, this tensor contains "
-        "the ids to be looked up in W and it must be a column vector with "
-        "rank = 2 while the 2nd dimension size must be 1; when Ids's type is "
-        "SelectedRows, the rows of Ids contains the ids to be looked up "
-        "in W.");
-    AddOutput("Out",
-              "(Tensor or SelectedRows) The lookup results, which have the "
-              "same type as W.");
+    AddInput("Ids",
+             "An input with type int32 or int64 "
+             "contains the ids to be looked up in W. "
+             "The last dimension size must be 1.");
+    AddOutput("Out", "The lookup results, which have the same type as W.");
     AddAttr<bool>("is_sparse",
                   "(boolean, default false) "
                   "Sparse update.")
@@ -90,15 +85,10 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
 Lookup Table Operator.
 
 This operator is used to perform lookups on the parameter W,
-then concatenated into a dense or sparse tensor.
-
-The type of Ids(Input) is SelectedRows, Tensor or LoDTensor, when Ids's
-type is SelectedRows, the rows of Ids contains the ids to be looked up in W;
-when Ids's type is Tensor, this tensor contains the ids to be looked up in W
-and it must be a column vector with rank = 2 while the 2nd dimension size must be 1,
-at this time, Ids can carry the LoD (Level of Details) information, or not, and
-the output only shares the LoD information with input Ids.
+then concatenated into a dense tensor.
 
+The input Ids can carry the LoD (Level of Details) information,
+or not. And the output only shares the LoD information with input Ids.
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index 77722c50d3..74823dab09 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -23,7 +23,7 @@ namespace operators {
 
 template <typename T, int BlockDimX, int BlockDimY, int GridDimX,
           bool PaddingFlag>
-__global__ void LookupTable(T* output, const T* table, const int64_t* ids,
+__global__ void LookupTable(T *output, const T *table, const int64_t *ids,
                             const int64_t N, const int64_t K, const int64_t D,
                             const int64_t padding_idx) {
   int idx = threadIdx.x;
@@ -33,8 +33,8 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
     int64_t id = ids[idy];
     PADDLE_ASSERT(id >= 0);
     PADDLE_ASSERT(id < N);
-    T* out = output + idy * D;
-    const T* tab = table + id * D;
+    T *out = output + idy * D;
+    const T *tab = table + id * D;
     for (int i = idx; i < D; i += BlockDimX) {
       if (PaddingFlag) {
         if (id == padding_idx)
@@ -50,7 +50,7 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
 }
 
 template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
-__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids,
+__global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids,
                                 const int64_t N, const int64_t K,
                                 const int64_t D) {
   int idx = threadIdx.x;
@@ -60,8 +60,8 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids,
     int id = ids[idy];
     PADDLE_ASSERT(id >= 0);
     PADDLE_ASSERT(id < N);
-    const T* out = output + idy * D;
-    T* tab = table + id * D;
+    const T *out = output + idy * D;
+    T *tab = table + id * D;
     for (int i = idx; i < D; i += BlockDimX) {
       paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
     }
@@ -72,36 +72,19 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids,
 template <typename T>
 class LookupTableCUDAKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* table_t = context.Input<LoDTensor>("W");
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *table_t = context.Input<LoDTensor>("W");
+    auto *ids_t = context.Input<LoDTensor>("Ids");
+    auto *output_t = context.Output<LoDTensor>("Out");
     int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-    auto* ids_var = context.InputVar("Ids");
-    Tensor* output_t = context.Output<Tensor>("Out");
-
-    int64_t* ids;
-    int64_t K;
-
-    // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
-    // is LoDTensor, this tensor contains the ids to be looked up in W;
-    // when Ids's type is SelectedRows, the rows of Ids contains the
-    // ids to be looked up in W.
-    if (ids_var->IsType<framework::LoDTensor>()) {
-      auto* ids_t = context.Input<LoDTensor>("Ids");
-      ids = const_cast<int64_t*>(ids_t->data<int64_t>());
-      K = ids_t->numel();
-    } else if (ids_var->IsType<framework::SelectedRows>()) {
-      auto* ids_t = context.Input<framework::SelectedRows>("Ids");
-      ids = const_cast<int64_t*>(ids_t->rows().CUDAData(context.GetPlace()));
-      K = ids_t->rows().size();
-      output_t->Resize({K, table_t->dims()[1]});
-    } else {
-      PADDLE_THROW("Unsupported Variable Type of Ids");
-    }
 
     size_t N = table_t->dims()[0];
     size_t D = table_t->dims()[1];
-    auto* table = table_t->data<T>();
-    auto* output = output_t->mutable_data<T>(context.GetPlace());
+    size_t K = ids_t->numel();
+
+    auto *ids = ids_t->data<int64_t>();
+    auto *table = table_t->data<T>();
+    auto *output = output_t->mutable_data<T>(context.GetPlace());
 
     dim3 threads(128, 8);
     dim3 grids(8, 1);
@@ -122,41 +105,44 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
 template <typename T>
 class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto& dev_ctx =
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto &dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();
     bool is_sparse = context.Attr<bool>("is_sparse");
     // Since paddings are not trainable and fixed in forward, the gradient of
     // paddings makes no sense and we don't deal with it in backward.
     if (is_sparse) {
-      auto* ids = context.Input<LoDTensor>("Ids");
-      auto* table = context.Input<LoDTensor>("W");
-      auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto* d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *table = context.Input<LoDTensor>("W");
+      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
 
-      auto* ids_data = ids->data<int64_t>();
-      auto ids_dim = ids->dims();
+      auto *ids_data = ids->data<int64_t>();
+      int64_t ids_num = ids->numel();
 
       auto stream = dev_ctx.stream();
       // copy GPU memory to CPU pinned memory
       framework::Vector<int64_t> new_rows;
-      new_rows.resize(ids_dim[0]);
+      new_rows.resize(ids_num);
       auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
 
       // TODO(yuyang18): Strange code here.
       memory::Copy(platform::CPUPlace(),
                    new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
-                   ids_data, ids_dim[0] * sizeof(int64_t), stream);
+                   ids_data, ids_num * sizeof(int64_t), stream);
 
       d_table->set_rows(new_rows);
 
-      auto* d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_dim[0], table->dims()[1]});
+      auto *d_table_value = d_table->mutable_value();
+      d_table_value->Resize({ids_num, table->dims()[1]});
       d_table_value->mutable_data<T>(context.GetPlace());
 
-      auto* d_table_data = d_table_value->data<T>();
-      auto* d_output_data = d_output->data<T>();
-      PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
+      auto *d_table_data = d_table_value->data<T>();
+      auto *d_output_data = d_output->data<T>();
+      auto d_output_dims = d_output->dims();
+      PADDLE_ENFORCE_EQ(
+          d_table_value->dims(),
+          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
       memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data,
                    d_output->numel() * sizeof(T), stream);
 
@@ -168,9 +154,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       int N = d_table_t->dims()[0];
       int D = d_table_t->dims()[1];
       int K = ids_t->numel();
-      const int64_t* ids = ids_t->data<int64_t>();
-      const T* d_output = d_output_t->data<T>();
-      T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
+      const int64_t *ids = ids_t->data<int64_t>();
+      const T *d_output = d_output_t->data<T>();
+      T *d_table = d_table_t->mutable_data<T>(context.GetPlace());
 
       auto t = framework::EigenVector<T>::Flatten(*d_table_t);
       t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index d482506bf0..f5c10ced83 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -36,43 +36,13 @@ template <typename T>
 class LookupTableKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
+    auto *ids_t = context.Input<LoDTensor>("Ids");      // int tensor
+    auto *output_t = context.Output<LoDTensor>("Out");  // float tensor
     auto *table_var = context.InputVar("W");
-    auto *ids_var = context.InputVar("Ids");
-    Tensor *output_t = context.Output<Tensor>("Out");
-    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-
-    DDim table_dim;
 
-    if (table_var->IsType<LoDTensor>()) {
-      table_dim = context.Input<LoDTensor>("W")->dims();
-    } else if (table_var->IsType<SelectedRows>()) {
-      auto *table_t = context.Input<SelectedRows>("W");
-      table_dim = table_t->value().dims();
-    } else {
-      PADDLE_THROW(
-          "The parameter W of a LookupTable "
-          "must be either LoDTensor or SelectedRows");
-    }
-
-    int64_t *ids;
-    int64_t ids_numel;
-
-    // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
-    // is LoDTensor, this tensor contains the ids to be looked up in W;
-    // when Ids's type is SelectedRows, the rows of Ids contains the
-    // ids to be looked up in W.
-    if (ids_var->IsType<LoDTensor>()) {
-      auto *ids_t = context.Input<LoDTensor>("Ids");
-      ids = const_cast<int64_t *>(ids_t->data<int64_t>());
-      ids_numel = ids_t->numel();
-    } else if (ids_var->IsType<SelectedRows>()) {
-      auto *ids_t = context.Input<SelectedRows>("Ids");
-      ids = const_cast<int64_t *>(ids_t->rows().data());
-      ids_numel = ids_t->rows().size();
-      output_t->Resize({ids_numel, table_dim[1]});
-    } else {
-      PADDLE_THROW("Unsupported Variable Type of Ids");
-    }
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+    int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>());
+    int64_t ids_numel = ids_t->numel();
 
     if (table_var->IsType<LoDTensor>()) {
       auto *table_t = context.Input<LoDTensor>("W");
@@ -139,17 +109,17 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
 
       auto *ids_data = ids->data<int64_t>();
-      auto ids_dim = ids->dims();
+      int64_t ids_num = ids->numel();
 
       framework::Vector<int64_t> new_rows;
-      new_rows.reserve(ids_dim[0]);
-      for (int64_t i = 0; i < ids_dim[0]; i++) {
+      new_rows.reserve(ids_num);
+      for (int64_t i = 0; i < ids_num; i++) {
         new_rows.push_back(ids_data[i]);
       }
       d_table->set_rows(new_rows);
 
       auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_dim[0], table_dim[1]});
+      d_table_value->Resize({ids_num, table_dim[1]});
       d_table_value->mutable_data<T>(context.GetPlace());
 
       d_table->set_height(table_dim[0]);
@@ -157,7 +127,10 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       auto *d_output_data = d_output->data<T>();
       auto *d_table_data = d_table_value->data<T>();
 
-      PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
+      auto d_output_dims = d_output->dims();
+      PADDLE_ENFORCE_EQ(
+          d_table_value->dims(),
+          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
       memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
     } else {
       auto *ids = context.Input<LoDTensor>("Ids");
@@ -165,10 +138,9 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
 
       auto *ids_data = ids->data<int64_t>();
-      auto ids_dim = ids->dims();
 
       int N = table_dim[0];
-      int D = d_output->dims()[1];
+      int D = table_dim[1];
 
       auto *d_output_data = d_output->data<T>();
       auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());
diff --git a/paddle/fluid/operators/math/functors.h b/paddle/fluid/operators/math/functors.h
new file mode 100644
index 0000000000..ad2f49ccbf
--- /dev/null
+++ b/paddle/fluid/operators/math/functors.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// AddFunctor
+template <typename T>
+struct AddFunctor {
+  // out = x + y;
+  inline HOSTDEVICE T operator()(T x, T y) { return x + y; }
+};
+
+template <typename T>
+struct AddGradFunctor {
+  inline HOSTDEVICE T operator()(T x, T y) { return 1; }
+
+  inline HOSTDEVICE T operator()(T x, T y, T out) const { return 1; }
+};
+
+template <typename T>
+struct ScaleFunctor {
+  explicit ScaleFunctor(const T coeff) : coeff_(coeff) {}
+
+  inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; }
+
+ private:
+  T coeff_;
+};
+
+template <typename T>
+struct ScaleGradFunctor {
+  explicit ScaleGradFunctor(T coeff) : coeff_(coeff) {}
+
+  inline HOSTDEVICE T operator()(T x) { return coeff_; }
+
+  inline HOSTDEVICE T operator()(T x, T out) { return coeff_; }
+
+ private:
+  T coeff_;
+};
+
+template <typename T>
+struct ReluFunctor {
+  inline HOSTDEVICE T operator()(T x) { return x * (x > 0); }
+};
+
+template <typename T>
+struct ReluGradFunctor {
+  inline HOSTDEVICE T operator()(T x) { return x > 0 ? 1 : 0; }
+
+  inline HOSTDEVICE T operator()(T x, T out) { return x > 0 ? 1 : 0; }
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index a50b9ace39..1472edbbf4 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/im2col.h"
 #include <vector>
+#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
 
 namespace paddle {
 namespace operators {
@@ -35,36 +36,18 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col->dims().size() == 5);
 
-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[1];
-    int filter_width = col->dims()[2];
-    int col_height = col->dims()[3];
-    int col_width = col->dims()[4];
-
-    int channels_col = im_channels * filter_height * filter_width;
-
-    const T* im_data = im.data<T>();
-    T* col_data = col->data<T>();
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / (filter_width * filter_height);
-      for (int h = 0; h < col_height; ++h) {
-        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
-        for (int w = 0; w < col_width; ++w) {
-          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-          int col_idx = (c * col_height + h) * col_width + w;
-          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
-
-          col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
-                               im_col_idx < 0 || im_col_idx >= im_width)
-                                  ? static_cast<T>(0)
-                                  : im_data[im_idx];
-        }
+    if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 &&
+        dilation[1] == 1) {
+      if (padding[0] == 0 && padding[1] == 0) {
+        im2col_sh1sw1dh1dw1ph0pw0<T>(im, col);
+        return;
+      } else if (padding[0] == 1 && padding[1] == 1) {
+        im2col_sh1sw1dh1dw1ph1pw1<T>(im, col);
+        return;
       }
+      // TODO(TJ): complete padding >=2
     }
+    im2col_common<T>(im, dilation, stride, padding, col);
   }
 };
 
diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h
new file mode 100644
index 0000000000..0d32bc5bd0
--- /dev/null
+++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h
@@ -0,0 +1,252 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/**
+ * The most common im2col algorithm.
+ * Support dilation, stride and padding.
+ */
+template <typename T>
+inline void im2col_common(const framework::Tensor& im,
+                          const std::vector<int>& dilation,
+                          const std::vector<int>& stride,
+                          const std::vector<int>& padding,
+                          framework::Tensor* col) {
+  int im_channels = im.dims()[0];
+  int im_height = im.dims()[1];
+  int im_width = im.dims()[2];
+  int filter_height = col->dims()[1];
+  int filter_width = col->dims()[2];
+  int output_height = col->dims()[3];
+  int output_width = col->dims()[4];
+  int channels_col = im_channels * filter_height * filter_width;
+
+  const T* im_data = im.data<T>();
+  T* col_data = col->data<T>();
+  for (int c = 0; c < channels_col; ++c) {
+    int w_offset = c % filter_width;
+    int h_offset = (c / filter_width) % filter_height;
+    int c_im = c / (filter_width * filter_height);
+    for (int h = 0; h < output_height; ++h) {
+      int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
+      for (int w = 0; w < output_width; ++w) {
+        int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
+        int col_idx = (c * output_height + h) * output_width + w;
+        int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
+        col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
+                             im_col_idx < 0 || im_col_idx >= im_width)
+                                ? static_cast<T>(0)
+                                : im_data[im_idx];
+      }
+    }
+  }
+}
+
+/**
+ * im2col algorithm with strides == 1, dilations == 1, paddings == 0
+ */
+template <typename T>
+inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im,
+                                      framework::Tensor* col) {
+  int im_channels = im.dims()[0];
+  int im_height = im.dims()[1];
+  int im_width = im.dims()[2];
+  int filter_height = col->dims()[1];
+  int filter_width = col->dims()[2];
+  int output_height = col->dims()[3];
+  int output_width = col->dims()[4];
+
+  const T* im_data = im.data<T>();
+  T* col_data = col->data<T>();
+  int col_matrix_width = output_width * output_height;
+  int im_size = im_height * im_width;
+  size_t copy_size = sizeof(T) * output_width;
+  const T* im_data_oh = im_data;
+  T* dst_data_oh = col_data;
+  for (int oh = 0; oh < output_height; ++oh) {
+    const T* src_data_ic = im_data_oh;
+    T* dst_data = dst_data_oh;
+    for (int ic = 0; ic < im_channels; ++ic) {
+      const T* src_data = src_data_ic;
+      for (int kh = 0; kh < filter_height; ++kh) {
+        for (int kw = 0; kw < filter_width; ++kw) {
+          std::memcpy(dst_data, src_data + kw, copy_size);
+          dst_data = dst_data + col_matrix_width;
+        }
+        src_data = src_data + im_width;
+      }
+      src_data_ic = src_data_ic + im_size;
+    }
+    im_data_oh = im_data_oh + im_width;
+    dst_data_oh = dst_data_oh + output_width;
+  }
+}
+
+/**
+ * im2col algorithm with strides == 1, dilations == 1, paddings == 1
+ * and filter_width == 1 have a special implementation
+ */
+template <typename T>
+inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im,
+                                      framework::Tensor* col) {
+  int im_channels = im.dims()[0];
+  int im_height = im.dims()[1];
+  int im_width = im.dims()[2];
+  int filter_height = col->dims()[1];
+  int filter_width = col->dims()[2];
+  int output_height = col->dims()[3];
+  int output_width = col->dims()[4];
+
+  constexpr int plh = 1;
+  constexpr int prh = 1;
+  constexpr int plw = 1;
+  constexpr int prw = 1;
+
+  const T* im_data = im.data<T>();
+  T* col_data = col->data<T>();
+  int im_size = im_height * im_width;
+  int col_matrix_width = output_width * output_height;
+  int col_block_fh = filter_width * col_matrix_width;  // fw*oh*ow
+  int col_block_ic = filter_height * col_block_fh;     // fh*fw*oh*ow
+
+  // fill height padding
+  {
+    size_t copy_size = sizeof(T) * output_width;
+    T* col_start_l = col_data;
+    T* col_start_r = col_data + (filter_height - 1) * col_block_fh +
+                     col_matrix_width - output_width;
+    for (int ic = 0; ic < im_channels; ++ic) {
+      T* dst_data_l = col_start_l;
+      T* dst_data_r = col_start_r;
+      for (int kw = 0; kw < filter_width; ++kw) {
+        std::memset(dst_data_l, 0, copy_size);
+        std::memset(dst_data_r, 0, copy_size);
+        dst_data_l = dst_data_l + col_matrix_width;
+        dst_data_r = dst_data_r + col_matrix_width;
+      }
+      col_start_l = col_start_l + col_block_ic;
+      col_start_r = col_start_r + col_block_ic;
+    }
+  }
+
+  auto pad = static_cast<T>(0);
+  if (filter_width == 1) {
+    // fill width padding
+    T* dst_data_ic = col_data;
+    for (int ic = 0; ic < im_channels; ++ic) {
+      T* dst_data_kh = dst_data_ic;
+      for (int kh = 0; kh < filter_height; ++kh) {
+        T* dst_data = dst_data_kh;
+        for (int oh = 0; oh < output_height; ++oh) {
+          *dst_data = pad;
+          dst_data = dst_data + output_width - 1;
+          *dst_data = pad;
+          ++dst_data;
+        }
+        dst_data_kh = dst_data_kh + col_block_fh;
+      }
+      dst_data_ic = dst_data_ic + col_block_ic;
+    }
+    // fill core
+    size_t copy_size = sizeof(T) * (output_width - plw - prw);
+    for (int oh = 0; oh < output_height; ++oh) {
+      const T* im_data_start =
+          im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
+      T* dst_data = col_data + oh * output_width;
+      for (int ic = 0; ic < im_channels; ++ic) {
+        const T* src_data = im_data_start + ic * im_size;
+        for (int kh = 0; kh < filter_height; ++kh) {
+          if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
+                                         kh > (filter_height - prh - 1))) {
+            dst_data = dst_data + col_matrix_width;
+            continue;
+          }
+          std::memcpy(dst_data + plw, src_data, copy_size);
+          dst_data = dst_data + col_matrix_width;
+          src_data = src_data + im_width;
+        }
+      }
+    }
+    return;
+  }
+
+  // filter_width != 1
+  // fill width padding
+  T* dst_data_ic = col_data;
+  for (int ic = 0; ic < im_channels; ++ic) {
+    T* dst_data_kh = dst_data_ic;
+    for (int kh = 0; kh < filter_height; ++kh) {
+      for (T* dst_data :
+           {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width +
+                             output_width - 1}) {
+        // TODO(TJ): from plh, saving repeated assignment
+        for (int oh = 0; oh < output_height; ++oh) {
+          *dst_data = pad;
+          dst_data = dst_data + output_width;
+        }
+      }
+      dst_data_kh = dst_data_kh + col_block_fh;
+    }
+    dst_data_ic = dst_data_ic + col_block_ic;
+  }
+
+  // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) *
+  // (output_width-1)}
+  // length of copy_size is equal kw.
+  for (int oh = 0; oh < output_height; ++oh) {
+    const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width;
+    T* dst_data = col_data + oh * output_width;
+    for (int ic = 0; ic < im_channels; ++ic) {
+      const T* src_data = im_data_start + ic * im_size;
+      for (int kh = 0; kh < filter_height; ++kh) {
+        if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) &&
+                                       kh > (filter_height - prh - 1))) {
+          dst_data = dst_data + filter_width * col_matrix_width;
+          continue;
+        }
+        // TODO(TJ): reuse plw-kw outside this for
+        // try to unify
+        for (int kw = 0; kw < plw; ++kw) {
+          std::memcpy(dst_data + (plw - kw), src_data,
+                      sizeof(T) * (output_width - (plw - kw)));
+          dst_data = dst_data + col_matrix_width;
+        }
+        for (int kw = plw; kw < filter_width - prw; ++kw) {
+          std::memcpy(dst_data, src_data + (kw - plw),
+                      sizeof(T) * output_width);
+          dst_data = dst_data + col_matrix_width;
+        }
+        int i = 1;
+        for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) {
+          std::memcpy(dst_data, src_data + (kw - plw),
+                      sizeof(T) * (output_width - i));
+          dst_data = dst_data + col_matrix_width;
+        }
+        src_data = src_data + im_width;
+      }
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
index 8e3f0f2868..ae2c90b33a 100644
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/im2col.h"
 #include <gtest/gtest.h>
+#include <sys/time.h>
 #include <vector>
+#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
 
 template <typename DeviceContext, typename Place>
 void testIm2col() {
@@ -167,3 +169,104 @@ TEST(math, im2col) {
              paddle::platform::CUDAPlace>();
 #endif
 }
+
+#define PREPARE_IM2COL_CPU                                                   \
+  paddle::platform::CPUPlace place;                                          \
+  paddle::platform::CPUDeviceContext context(place);                         \
+  paddle::framework::Tensor input;                                           \
+  paddle::framework::Tensor out;                                             \
+  paddle::framework::Tensor ref;                                             \
+  std::vector<int> padding({ph, pw});                                        \
+  std::vector<int> stride({1, 1});                                           \
+  std::vector<int> dilation({1, 1});                                         \
+  float* input_ptr = input.mutable_data<float>({ic, ih, iw}, place);         \
+  for (int i = 0; i < input.numel(); ++i) {                                  \
+    input_ptr[i] = static_cast<float>(i + 1);                                \
+  }                                                                          \
+  int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1;            \
+  int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1;             \
+  out.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
+  ref.mutable_data<float>({ic, fh, fw, output_height, output_width}, place); \
+  paddle::operators::math::Im2ColFunctor<                                    \
+      paddle::operators::math::ColFormat::kCFO,                              \
+      paddle::platform::CPUDeviceContext, float>                             \
+      im2col
+
+void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
+  PREPARE_IM2COL_CPU;
+
+  im2col(context, input, dilation, stride, padding, &out);
+  paddle::operators::math::im2col_common<float>(input, dilation, stride,
+                                                padding, &ref);
+
+  float* ref_data = ref.data<float>();
+  float* out_data = out.data<float>();
+  for (int i = 0; i < out.numel(); ++i) {
+    EXPECT_EQ(out_data[i], ref_data[i]);
+  }
+}
+
+void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) {
+  PREPARE_IM2COL_CPU;
+  constexpr int repeat = 100;
+  auto GetCurrentMs = []() -> double {
+    struct timeval time;
+    gettimeofday(&time, NULL);
+    return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
+  };
+  auto t1 = GetCurrentMs();
+  for (int i = 0; i < repeat; ++i) {
+    im2col(context, input, dilation, stride, padding, &out);
+  }
+  auto t2 = GetCurrentMs();
+
+  for (int i = 0; i < repeat; ++i) {
+    paddle::operators::math::im2col_common<float>(input, dilation, stride,
+                                                  padding, &ref);
+  }
+  auto t3 = GetCurrentMs();
+
+  LOG(INFO) << "before: " << (t3 - t2) / repeat
+            << ",after: " << (t2 - t1) / repeat
+            << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%";
+}
+
+TEST(math, im2col_cputest) {
+  // padding_h == padding_w
+  for (int p = 0; p < 4; ++p) {
+    // width == height
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 5, /*fh*/ 4, /*fw*/ 4, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 2, /*ph*/ p,
+                  /*pw*/ p);
+
+    // height != width
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
+
+    // filter == 1
+    testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
+    testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p,
+                  /*pw*/ p);
+  }
+
+  // padding_h != padding_w
+  testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1,
+                /*pw*/ 2);
+
+  // benchmark
+  for (int p : {0, 1}) {
+    for (int k : {1, 3, 5}) {
+      LOG(INFO) << "padding == " << p << ", filter == " << k;
+      benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k,
+                  /*ph*/ p, /*pw*/ p);
+    }
+  }
+}
diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index a579182ec1..3effe77625 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -52,7 +52,7 @@ void SoftmaxCUDNNFunctor<T>::operator()(
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
   cudnnTensorDescriptor_t cudnn_y_desc =
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxForward(
+  CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxForward(
       context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
       CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_x_desc,
       X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
@@ -83,7 +83,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
       dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
   cudnnTensorDescriptor_t cudnn_ygrad_desc =
       dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxBackward(
+  CUDNN_ENFORCE(platform::dynload::cudnnSoftmaxBackward(
       context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
       CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_y_desc,
       Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
index c9744db3d0..eb09470f37 100644
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -164,14 +163,11 @@ class ParallelDoOp : public framework::OperatorBase {
       auto &place = places[place_idx];
       auto *cur_scope = sub_scopes[place_idx];
 
-      workers.emplace_back(
-          framework::Async([program, cur_scope, place, block, place_idx] {
-            // Give the thread an id to distinguish parallel block with same id.
-            platform::RecordThread rt(static_cast<int>(place_idx) + 1);
-            framework::Executor executor(place);
-            executor.Run(*program, cur_scope, block->ID(),
-                         false /*create_local_scope*/);
-          }));
+      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
+        framework::Executor executor(place);
+        executor.Run(*program, cur_scope, block->ID(),
+                     false /*create_local_scope*/);
+      }));
     }
     for (auto &worker : workers) {
       worker.wait();
@@ -242,14 +238,11 @@ class ParallelDoGradOp : public framework::OperatorBase {
       auto *cur_scope = sub_scopes[i];
 
       // execute
-      workers.emplace_back(
-          framework::Async([program, cur_scope, place, block, i] {
-            // Give the thread an id to distinguish parallel block with same id.
-            platform::RecordThread rt(static_cast<int>(i) + 1);
-            framework::Executor executor(place);
-            executor.Run(*program, cur_scope, block->ID(),
-                         false /*create_local_scope*/);
-          }));
+      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
+        framework::Executor executor(place);
+        executor.Run(*program, cur_scope, block->ID(),
+                     false /*create_local_scope*/);
+      }));
     }
     for (auto &worker : workers) {
       worker.wait();
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index be55bc43b1..31f083565f 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -81,7 +81,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn pool algorithm ---------------------
     auto handle = ctx.cuda_device_context().cudnn_handle();
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward(
+    CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward(
         handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
         cudnn_output_desc, output_data));
   }
@@ -154,7 +154,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
       T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
 
-      PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward(
+      CUDNN_ENFORCE(platform::dynload::cudnnPoolingBackward(
           handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
           cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data,
           &beta, cudnn_input_desc, input_grad_data));
diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc
index 65fcce8bb0..a0d640b202 100644
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -65,6 +66,12 @@ class ReadOp : public framework::OperatorBase {
             .GetMutable<framework::ReaderHolder>();
     std::vector<std::string> out_arg_names = Outputs("Out");
     std::vector<framework::LoDTensor> ins;
+
+    // For profiling
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(dev_place);
+    platform::RecordEvent record_event(Type(), &ctx);
+
     reader->ReadNext(&ins);
     if (ins.empty()) {
       if (Attr<bool>("throw_eof_exp")) {
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
index 1ba6840149..4a6ce938a5 100644
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -40,8 +40,6 @@ class RecvOp : public framework::OperatorBase {
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
-    // For profiling
-    platform::RecordEvent record_event(Type(), &ctx);
 
     distributed::RPCClient* rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>();
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index a9fd1869c9..a1dfe39c3a 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -127,12 +127,6 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "(Tensor). The output tensor of reshape operator.");
     AddAttr<std::vector<int>>(
         "shape", "(std::vector<int>) Target shape of reshape operator.");
-    AddAttr<bool>("inplace",
-                  "(default: false) Change the source tensor's shape without "
-                  "memory copy. When Attr(inplace) is set true, the output "
-                  "tensor shares memory with Input(X), otherwise, a new output "
-                  "tensor is created, and its data are copied from Input(x).")
-        .SetDefault(false);
     AddComment(R"DOC(
 Reshape Operator.
 
@@ -233,16 +227,9 @@ class ReshapeKernel {
           "sequence_reshape op.");
     }
 
-    bool inplace = ctx.Attr<bool>("inplace");
+    out->mutable_data(ctx.GetPlace(), in->type());
+    framework::TensorCopySync(*in, ctx.GetPlace(), out);
     out->Resize(out_dims);
-    if (!inplace) {
-      out->mutable_data(ctx.GetPlace(), in->type());
-      framework::TensorCopySync(*in, ctx.GetPlace(), out);
-      out->Resize(out_dims);
-    } else {
-      out->ShareDataWith(*in);
-      out->Resize(out_dims);
-    }
   }
 };
 
@@ -251,19 +238,11 @@ class ReshapeGradKernel {
   void operator()(const framework::ExecutionContext &ctx) const {
     auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto in_dims = d_x->dims();
 
     d_x->mutable_data(ctx.GetPlace(), d_out->type());
-    bool inplace = ctx.Attr<bool>("inplace");
-
-    auto in_dims = d_x->dims();
-    if (!inplace) {
-      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
-      ctx.device_context().Wait();
-      d_x->Resize(in_dims);
-    } else {
-      d_x->ShareDataWith(*d_out);
-      d_x->Resize(in_dims);
-    }
+    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
+    d_x->Resize(in_dims);
   }
 };
 
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
index d7f8e994af..1866a86048 100644
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -39,11 +39,6 @@ class SendBarrierOp : public framework::OperatorBase {
     std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
     bool sync_mode = Attr<bool>("sync_mode");
 
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(place);
-    // For profiling
-    platform::RecordEvent record_event(Type(), &ctx);
-
     distributed::RPCClient* rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index 829f310d42..3cd42f2d05 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -42,9 +42,6 @@ class SendOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
 
-    // For profiling
-    platform::RecordEvent record_event(Type(), &ctx);
-
     distributed::RPCClient* rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h
index deab005149..dc26c53c64 100644
--- a/paddle/fluid/operators/send_recv_util.h
+++ b/paddle/fluid/operators/send_recv_util.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <string>
+#include "paddle/fluid/framework/ir/node.h"
 
 namespace paddle {
 namespace operators {
@@ -22,7 +23,10 @@ inline bool NeedSend(const framework::Scope& scope,
                      const std::string& varname) {
   // dummy variable is only used in parallel executor to represent
   // some dependency relationship, we don't need to send/recv it.
-  if (varname == "dummy") return false;
+  // TODO(paddle-dev): Why would parallel executor logic leaked into here?
+  if (varname.find(framework::ir::Node::kControlDepVarName) !=
+      std::string::npos)
+    return false;
   auto* var = scope.FindVar(varname);
   PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
                           varname);
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
index 5596fa0648..2bdb23e999 100644
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -30,8 +30,16 @@ class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     Out->mutable_data<T>(context.GetPlace());
 
+    auto dims = X->dims();
+    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    framework::LoDTensor flattened_x;
+    framework::LoDTensor flattened_out;
+    flattened_x.ShareDataWith(*X).Resize(flattened_dims);
+    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
+
     math::SoftmaxCUDNNFunctor<T>()(
-        context.template device_context<platform::CUDADeviceContext>(), X, Out);
+        context.template device_context<platform::CUDADeviceContext>(),
+        &flattened_x, &flattened_out);
   }
 };
 
@@ -46,9 +54,18 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     dX->mutable_data<T>(context.GetPlace());
 
+    auto dims = Out->dims();
+    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    framework::LoDTensor flattened_out;
+    framework::LoDTensor flattened_d_out;
+    framework::LoDTensor flattened_d_x;
+    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
+    flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
+    flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
+
     math::SoftmaxGradCUDNNFunctor<T>()(
-        context.template device_context<platform::CUDADeviceContext>(), Out,
-        dOut, dX);
+        context.template device_context<platform::CUDADeviceContext>(),
+        &flattened_out, &flattened_d_out, &flattened_d_x);
   }
 };
 
diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc
index 6668e6b9e9..01819f53e3 100644
--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -26,9 +26,9 @@ using paddle::platform::MKLDNNMemDesc;
 
 using mkldnn::memory;  // Note: paddle has also "memory" namespace
 using mkldnn::primitive;
-using mkldnn::softmax_forward;
-using mkldnn::softmax_backward;
 using mkldnn::prop_kind;
+using mkldnn::softmax_backward;
+using mkldnn::softmax_forward;
 using mkldnn::stream;
 using platform::to_void_cast;
 
@@ -113,17 +113,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
     auto mkldnn_engine = dev_ctx.GetEngine();
     const Tensor* input = ctx.Input<Tensor>("X");
     Tensor* output = ctx.Output<Tensor>("Out");
-    PADDLE_ENFORCE(input->dims().size() == 2UL,
-                   "The input of softmax op must be a 2D matrix.");
-    const T* input_data = input->data<T>();
-    // allocate memory for output
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
-    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
-    // MKL-DNN does support softmax over selected axis. Having 2D Tensor,
-    // we will make normalization after final eg. axis: 1
-    PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
-                   "Softmax input and output dimensions should match");
+    PADDLE_ENFORCE_EQ(
+        input->dims(), output->dims(),
+        "The shape of softmax's input and output must be identical.");
+
+    // make sure 'output' holds memory, which will be shared by
+    // 'flattened_output' later.
+    output->mutable_data<T>(ctx.GetPlace());
+
+    // flatten input and output to 2-D matrixs
+    auto dims = input->dims();  // input and output share the same shape
+    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    framework::Tensor flattened_input;
+    framework::Tensor flattened_output;
+    flattened_input.ShareDataWith(*input).Resize(flattened_dims);
+    flattened_output.ShareDataWith(*output).Resize(flattened_dims);
+
+    const T* input_data = flattened_input.data<T>();
+    T* output_data = flattened_output.mutable_data<T>(ctx.GetPlace());
+
+    std::vector<int> src_tz = paddle::framework::vectorize2int(flattened_dims);
+    std::vector<int> dst_tz = src_tz;
     // Same memory descriptor to be used for input and output
     memory::dims softmax_tz = {src_tz[0], src_tz[1]};
     // Generate keys for storing/retriving primitives for this operator
@@ -174,23 +184,34 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
     auto mkldnn_engine = dev_ctx.GetEngine();
     const Tensor* output = ctx.Input<Tensor>("Out");
-    const T* dst_data = output->data<T>();
-
     auto* dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
-    const auto* diff_dst_ptr = dout->template data<T>();
-
     auto* dx =
         ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
-    T* diff_src_ptr = dx->template mutable_data<T>(ctx.GetPlace());
 
-    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    PADDLE_ENFORCE_EQ(
+        dout->dims(), dx->dims(),
+        "The shape of softmax_grad's input and output must be identical.");
+
+    // make sure 'dx' holds memory, which will be shared by 'flattened_dx'
+    // later.
+    dx->template mutable_data<T>(ctx.GetPlace());
+
+    auto dims = dout->dims();  // input and output share the same shape
+    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    framework::Tensor flattened_output;
+    framework::Tensor flattened_dout;
+    framework::Tensor flattened_dx;
+    flattened_output.ShareDataWith(*output).Resize(flattened_dims);
+    flattened_dout.ShareDataWith(*dout).Resize(flattened_dims);
+    flattened_dx.ShareDataWith(*dx).Resize(flattened_dims);
+
+    const T* dst_data = flattened_output.data<T>();
+    const T* diff_dst_ptr = flattened_dout.template data<T>();
+    T* diff_src_ptr = flattened_dx.template mutable_data<T>(ctx.GetPlace());
+
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(flattened_dims);
     std::vector<int> src_tz(dst_tz);
-    PADDLE_ENFORCE(output->dims().size() == 2UL,
-                   "The input of softmax op must be a 2D matrix.");
-    // MKL-DNN does support softmax over selected axis. Having 2D Tensor,
-    // we will make normalization after final eg. axis: 1
-    PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
-                   "Softmax input and output dimensions should match");
+
     // Same memory descriptor to be used for input and output
     memory::dims softmax_tz = {src_tz[0], src_tz[1]};
     // Currently only supports NC data format
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 31a7458f63..bb08123882 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -37,10 +37,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SoftmaxOp should not be null.");
 
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(x_dims.size() == 2UL,
-                   "The input of softmax op must be a matrix.");
-    ctx->SetOutputDim("Out", x_dims);
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 
@@ -81,8 +78,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "The input tensor of softmax. "
-             "2-D with shape [batch_size, input_feature_dimensions].");
+             "The input tensor of softmax, "
+             "whose last dimension is the input_feature_dimensions.");
     AddOutput("Out", "The normalized values with the same shape as X.")
         .Reuse("X");
     AddAttr<bool>(
@@ -105,20 +102,23 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Softmax Operator.
 
-The input of the softmax operator is a 2-D tensor with shape N x K (N is the
-batch_size, K is the dimension of input feature). The output tensor has the
-same shape as the input tensor.
+The input of the softmax operator is a tensor of any rank. The output tensor 
+has the same shape as the input.
 
-For each row of the input tensor, the softmax operator squashes the
-K-dimensional vector of arbitrary real values to a K-dimensional vector of real
-values in the range [0, 1] that add up to 1.
+The input tensor will first be logically flattened to a 2-D matrix. The matrix's 
+second dimension(row length) is as same as the last dimension of the input 
+tensor, and the first dimension(column length) is the product of all other 
+dimensions of the input tensor. For each row of the matrix, the softmax operator 
+squashes the K-dimensional(K is the width of the matrix, which is also the size 
+of the input tensor's last dimension) vector of arbitrary real values to a 
+K-dimensional vector of real values in the range [0, 1] that add up to 1.
 It computes the exponential of the given dimension and the sum of exponential
 values of all the other dimensions in the K-dimensional vector input.
 Then the ratio of the exponential of the given dimension and the sum of
 exponential values of all the other dimensions is the output of the softmax
 operator.
 
-For each row $i$ and each column $j$ in Input(X), we have:
+For each row $i$ and each column $j$ in the matrix, we have:
     $$Out[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$
 
 )DOC");
@@ -137,7 +137,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
                       ctx->GetInputDim(framework::GradVarName("Out")),
                       "Input(Out) and its gradients should have a same shape.");
 
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->SetOutputDim(framework::GradVarName("X"),
+                      ctx->GetInputDim(framework::GradVarName("Out")));
   }
 
  protected:
@@ -160,8 +161,8 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
       layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    auto input_data_type = framework::ToDataType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type());
     if (input_data_type == framework::proto::VarType::FP16) {
       PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                      "float16 can only be used on GPU place");
@@ -172,13 +173,31 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
   }
 };
 
+class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("softmax_grad");
+
+    op->SetInput("Out", Output("Out"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::SoftmaxOpGradMaker);
 REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
 REGISTER_OP_CPU_KERNEL(
     softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h
index 600da45a0b..1205bd0587 100644
--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
@@ -31,8 +31,16 @@ class SoftmaxKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     Out->mutable_data<T>(context.GetPlace());
 
+    auto dims = X->dims();
+    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    framework::LoDTensor flattened_x;
+    framework::LoDTensor flattened_out;
+    flattened_x.ShareDataWith(*X).Resize(flattened_dims);
+    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
+
     math::SoftmaxFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), X, Out);
+        context.template device_context<DeviceContext>(), &flattened_x,
+        &flattened_out);
   }
 };
 
@@ -47,8 +55,18 @@ class SoftmaxGradKernel : public framework::OpKernel<T> {
     // allocate memory on device.
     dX->mutable_data<T>(context.GetPlace());
 
+    auto dims = Out->dims();
+    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    framework::LoDTensor flattened_out;
+    framework::LoDTensor flattened_d_out;
+    framework::LoDTensor flattened_d_x;
+    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
+    flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
+    flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
+
     math::SoftmaxGradFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), Out, dOut, dX);
+        context.template device_context<DeviceContext>(), &flattened_out,
+        &flattened_d_out, &flattened_d_x);
   }
 };
 
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index 8f7840cee1..a559b01ed3 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 
+#include <cub/cub.cuh>
+#include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
 
 namespace paddle {
@@ -53,8 +55,196 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
     logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]);
   }
 }
+
 }  // namespace
 
+static __device__ __forceinline__ float real_exp(float x) { return expf(x); }
+static __device__ __forceinline__ double real_exp(double x) { return exp(x); }
+static __device__ __forceinline__ float real_log(float x) {
+  return math::TolerableValue<float>()(logf(x));
+}
+static __device__ __forceinline__ double real_log(double x) {
+  return math::TolerableValue<double>()(log(x));
+}
+
+/** In the following codes, 3 CUDA kernels are implemented to calculate softmax
+ * and loss **/
+/*
+  Supposing the x is `logits` and y is `labels`, the equations are as
+followings:
+
+  cross\_entropy_i = \sum_{j}[- y_i_j * log({e^{x_i_j}/\sum_{j}e^{x_i_j}})]
+        = \sum_{j}[- y_i_j * log({e^{x_i_j - max_i}/\sum_{j}e^{x_i_j-max_i}})]
+        = \sum_{j}[-y_i_j * (x_i_j - max_i - log\sum_{j}e^{x_i_j - max_i})]
+        = \sum_{j}[-y_i_j * (x_i_j - max_i - logDiffMaxSum_i)]
+        = \sum_{j}(-y_i_j * tmp_i_j)
+
+  softmax_i_j = e^{tmp_i_j}
+
+where:
+  max_i = \max_{j}{x_i_j}
+  logDiffMaxSum_i = log\sum_{j}e^{x_i_j - max_i}
+  tmp_i_j = x_i_j - max_i - logDiffMaxSum_i
+
+Therefore, the calculation can be separated into 3 steps:
+Step 1: row-wise operation to calculate max_i
+Step 2: row-wise operation to calculate logDiffMaxSum_i
+Step 3: caculate tmp_i_j, and finally get softmax_i_j and cross\_entropy_i
+
+To save memory, we can share memory among max_i, logDiffMaxSum_i and
+cross\_entropy_i.
+In this way, the 3 steps should be changed to:
+Step 1 (RowReductionForMax): row-wise operation to calculate max_i
+Step 2 (RowReductionForDiffMaxSum): calculate immediate result of softmax'_i_j =
+x_i_j - max_i, and row-wise operation to calculate logDiffMaxSum_i
+Step 3 (RowReductionForSoftmaxAndCrossEntropy): calculate tmp_i_j = softmax'_i_j
+- logDiffMaxSum_i, and finally get softmax_i_j and cross\_entropy_i
+*/
+
+// There are 3 kinds of reduce algorithms in cub:
+// BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
+// BLOCK_REDUCE_RAKING
+// BLOCK_REDUCE_WARP_REDUCTIONS (default)
+template <typename T, int BlockDim>
+using BlockReduce =
+    cub::BlockReduce<T, BlockDim /*, cub::BLOCK_REDUCE_WARP_REDUCTIONS*/>;
+
+template <typename T, int BlockDim>
+using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
+
+// Make sure that BlockDim <= feature_size
+// This kernel is used to calculate the max element of each row
+template <typename T, int BlockDim>
+__global__ void RowReductionForMax(const T* logits_data, T* max_data,
+                                   int feature_size) {
+  __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
+
+  auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
+  auto end_idx = feature_size * (blockIdx.x + 1);
+
+  T cur_max = logits_data[beg_idx];
+  beg_idx += BlockDim;
+  while (beg_idx < end_idx) {
+    if (cur_max < logits_data[beg_idx]) {
+      cur_max = logits_data[beg_idx];
+    }
+    beg_idx += BlockDim;
+  }
+
+  cur_max = BlockReduce<T, BlockDim>(temp_storage).Reduce(cur_max, cub::Max());
+
+  if (threadIdx.x == 0) {
+    max_data[blockIdx.x] = cur_max < -64 ? -64 : cur_max;
+  }
+}
+
+// Make sure that BlockDim <= feature_size
+template <typename T, int BlockDim>
+__global__ void RowReductionForDiffMaxSum(const T* logits_data, T* max_data,
+                                          T* softmax, int feature_size) {
+  __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
+
+  auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
+  auto end_idx = feature_size * (blockIdx.x + 1);
+
+  auto block_max = max_data[blockIdx.x];
+
+  softmax[beg_idx] = logits_data[beg_idx] - block_max;
+  T diff_max_sum = real_exp(softmax[beg_idx]);
+  beg_idx += BlockDim;
+  while (beg_idx < end_idx) {
+    softmax[beg_idx] = logits_data[beg_idx] - block_max;
+    diff_max_sum += real_exp(softmax[beg_idx]);
+    beg_idx += BlockDim;
+  }
+
+  diff_max_sum =
+      BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
+  if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum);
+}
+
+// Make sure that BlockDim <= feature_size
+template <typename T, int BlockDim>
+__global__ void RowReductionForSoftmaxAndCrossEntropy(const T* logits_data,
+                                                      const T* labels_data,
+                                                      T* loss_data, T* softmax,
+                                                      int feature_size) {
+  __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
+
+  auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
+  auto end_idx = feature_size * (blockIdx.x + 1);
+
+  // log_diff_max_sum shares memory with loss
+  auto block_log_diff_max_sum = loss_data[blockIdx.x];
+  auto tmp = softmax[beg_idx] - block_log_diff_max_sum;
+  softmax[beg_idx] = real_exp(tmp);
+  auto loss = -labels_data[beg_idx] * tmp;
+  beg_idx += BlockDim;
+  while (beg_idx < end_idx) {
+    tmp = softmax[beg_idx] - block_log_diff_max_sum;
+    softmax[beg_idx] = real_exp(tmp);
+    loss -= (labels_data[beg_idx] * tmp);
+    beg_idx += BlockDim;
+  }
+
+  loss = BlockReduce<T, BlockDim>(temp_storage).Reduce(loss, cub::Sum());
+  if (threadIdx.x == 0) loss_data[blockIdx.x] = loss;
+}
+
+template <typename T>
+__global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out, int batch_size) {
+  auto idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < batch_size) out[idx] = static_cast<T>(1);
+}
+
+template <typename T>
+static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
+                                               const T* labels_data,
+                                               T* softmax_data, T* loss_data,
+                                               int batch_size, int feature_size,
+                                               cudaStream_t stream) {
+  constexpr int kMaxBlockDim = 512;
+  int block_dim = feature_size >= kMaxBlockDim
+                      ? kMaxBlockDim
+                      : (1 << static_cast<int>(std::log2(feature_size)));
+
+#define CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim)                \
+  case BlockDim:                                                              \
+    RowReductionForMax<T, BlockDim><<<batch_size, BlockDim, 0, stream>>>(     \
+        logits_data, loss_data, feature_size);                                \
+    RowReductionForDiffMaxSum<T,                                              \
+                              BlockDim><<<batch_size, BlockDim, 0, stream>>>( \
+        logits_data, loss_data, softmax_data, feature_size);                  \
+    RowReductionForSoftmaxAndCrossEntropy<                                    \
+        T, BlockDim><<<batch_size, BlockDim, 0, stream>>>(                    \
+        logits_data, labels_data, loss_data, softmax_data, feature_size);     \
+    break
+
+  switch (block_dim) {
+    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512);
+    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256);
+    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128);
+    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64);
+    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32);
+    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16);
+    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8);
+    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4);
+    CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2);
+    case 1:
+      SetSoftmaxToOneWhenFeatureSizeIsOne<<<(batch_size + kMaxBlockDim - 1) /
+                                                kMaxBlockDim,
+                                            kMaxBlockDim, 0, stream>>>(
+          softmax_data, batch_size);
+      cudaMemsetAsync(loss_data, 0, batch_size, stream);
+      break;
+    default:
+      PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
+      break;
+  }
+
+#undef CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
+}
+
 template <typename T>
 class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
  public:
@@ -66,14 +256,24 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
     Tensor* softmax = context.Output<Tensor>("Softmax");
 
     Tensor* loss = context.Output<Tensor>("Loss");
-    softmax->mutable_data<T>(context.GetPlace());
-    loss->mutable_data<T>(context.GetPlace());
-
-    math::SoftmaxFunctor<platform::CUDADeviceContext, T>()(
-        context.cuda_device_context(), logits, softmax);
-    math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
-        context.cuda_device_context(), loss, softmax, labels,
-        context.Attr<bool>("soft_label"));
+    auto* softmax_data = softmax->mutable_data<T>(context.GetPlace());
+    auto* loss_data = loss->mutable_data<T>(context.GetPlace());
+
+    auto soft_label = context.Attr<bool>("soft_label");
+    if (soft_label) {
+      int batch_size = logits->dims()[0];
+      int feature_size = logits->dims()[1];
+      auto* logits_data = logits->data<T>();
+      auto* labels_data = labels->data<T>();
+      SoftmaxWithCrossEntropyFusedKernel(
+          logits_data, labels_data, softmax_data, loss_data, batch_size,
+          feature_size, context.cuda_device_context().stream());
+    } else {
+      math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
+                                     softmax);
+      math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
+          context.cuda_device_context(), loss, softmax, labels, false);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
index d263426e07..c4af5a65fc 100644
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
@@ -67,10 +68,15 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
       const auto &ids_rows = ids_selected_rows->rows();
       auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
       const size_t shard_num = outs.size();
+      for (auto &out : outs) {
+        out->mutable_rows()->clear();
+      }
       // get rows for outputs
-      for (auto &id : ids_rows) {
-        size_t shard_id = static_cast<size_t>(id) % shard_num;
-        outs[shard_id]->mutable_rows()->push_back(id);
+      std::unordered_map<int64_t, size_t> id_to_index;
+      for (size_t i = 0; i < ids_rows.size(); ++i) {
+        id_to_index[ids_rows[i]] = i;
+        size_t shard_id = static_cast<size_t>(ids_rows[i]) % shard_num;
+        outs[shard_id]->mutable_rows()->push_back(ids_rows[i]);
       }
 
       int64_t row_width = ids_dims[1];
@@ -80,7 +86,8 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
             {static_cast<int64_t>(out->rows().size()), row_width});
         T *output = out->mutable_value()->mutable_data<T>(ddim, place);
         for (int64_t i = 0; i < ddim[0]; ++i) {
-          memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
+          memcpy(output + i * row_width,
+                 ids + id_to_index[out->rows()[i]] * row_width,
                  row_width * sizeof(T));
         }
       }
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index db641a4bc2..ee3078876c 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -55,18 +55,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
                     "TensorRT' tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
                     "TensorRT' tensor input requires at most 4 dimensions");
-
-  switch (shape.size()) {
-    case 2:
-      return nvinfer1::Dims2(1, shape[1]);
-    case 3:
-      return nvinfer1::Dims3(1, shape[1], shape[2]);
-    case 4:
-      return nvinfer1::Dims4(1, shape[1], shape[2], shape[3]);
-    default:
-      return nvinfer1::Dims();
-  }
-  return nvinfer1::Dims();
+  PADDLE_ENFORCE_EQ(shape.size(), 4UL);
+  return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
 }
 
 }  // namespace
@@ -86,6 +76,9 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
     parameters.insert(param);
   }
 
+  std::vector<std::string> output_maps =
+      context.Attr<std::vector<std::string>>("output_name_mapping");
+
   // TODO(Superjomn) replace this with a different stream
   auto *engine = Singleton<TRT_EngineManager>::Global().Create(
       max_batch, max_workspace, nullptr /*engine hold its own stream*/,
@@ -97,6 +90,7 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
   // Add inputs
   VLOG(4) << "declare inputs";
   for (auto &input : context.Inputs("Xs")) {
+    if (parameters.count(input)) continue;
     VLOG(4) << "declare input " << input;
     auto *var = block.FindVar(input);
     // TensorRT engine need to create parameters. The parameter's description
@@ -122,7 +116,7 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
       block_desc, parameters, context.scope(), engine);
 
   // Add outputs
-  for (auto &output : context.Outputs("Ys")) {
+  for (auto &output : output_maps) {
     engine->DeclareOutput(output);
   }
 
@@ -163,7 +157,4 @@ REGISTER_OP_CPU_KERNEL(
     ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
     ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
 
-// A trick to compile with the needed TensorRT op converter.
-USE_TRT_CONVERTER(mul)
-
 #endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index 32d10fd8a5..2cbe1213a2 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -66,8 +66,17 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
                       context.Attr<int>("max_batch"));
 
+    std::vector<std::string> output_maps =
+        context.Attr<std::vector<std::string>>("output_name_mapping");
+
+    auto params = context.Attr<std::vector<std::string>>("parameters");
+    std::unordered_set<std::string> parameters;
+    for (const auto& param : params) {
+      parameters.insert(param);
+    }
     // Convert input tensor from fluid to engine.
     for (const auto& x : context.Inputs("Xs")) {
+      if (parameters.count(x)) continue;
       // convert input and copy to TRT engine's buffer
       auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
           context.scope(), x);
@@ -82,10 +91,12 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     // Execute the engine.
     PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
     engine->Execute(FLAGS_tensorrt_engine_batch_size);
+
     // Convert output tensor from engine to fluid
+    int output_index = 0;
     for (const auto& y : context.Outputs("Ys")) {
       // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine->GetITensor(y);
+      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
@@ -102,7 +113,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       // TODO(Superjomn) change this float to dtype size.
       auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
                   FLAGS_tensorrt_engine_batch_size;
-      engine->GetOutputInCPU(y,
+      engine->GetOutputInCPU(output_maps[output_index],
                              fluid_t->mutable_data<float>(platform::CPUPlace()),
                              size * sizeof(float));
       //} else {
@@ -110,6 +121,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       // y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
       // size * sizeof(float));
       //}
+      output_index += 1;
     }
 
     cudaStreamSynchronize(*engine->stream());
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
index 7cb1e47a15..37657fa0b0 100644
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -103,6 +103,9 @@ TEST(TensorRTEngineOp, manual) {
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
   SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                     std::vector<std::string>({}));
+  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(),
+                                    "output_name_mapping",
+                                    std::vector<std::string>({"z0"}));
 
   LOG(INFO) << "create engine op";
   auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
@@ -196,6 +199,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
       std::vector<std::string>({"y0", "y1", "y2", "y3"}));
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine");
 
+  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(),
+                                    "output_name_mapping",
+                                    std::vector<std::string>({"z3"}));
+
   auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
 
   // Execute them.
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index e0d7937ae2..f08c0e8e34 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -18,7 +18,11 @@ else()
 endif()
 cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
 
-cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog enforce)
+set(CPU_INFO_DEPS gflags glog enforce)
+IF(WITH_XBYAK)
+    list(APPEND CPU_INFO_DEPS xbyak)
+ENDIF()
+cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
 nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
@@ -60,3 +64,7 @@ cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
+
+IF(WITH_GPU)
+  nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
+ENDIF()
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index 77ecb17011..234a04b5c2 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 
 #ifdef PADDLE_WITH_MKLML
+#include <omp.h>
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif
 
@@ -33,6 +34,7 @@ void SetNumThreads(int num_threads) {
 #elif defined(PADDLE_WITH_MKLML)
   int real_num_threads = num_threads > 1 ? num_threads : 1;
   platform::dynload::MKL_Set_Num_Threads(real_num_threads);
+  omp_set_num_threads(num_threads);
 #else
   PADDLE_ENFORCE(false, "To be implemented.");
 #endif
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index f832d72b53..7d53a684d6 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -14,6 +14,11 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/cpu_info.h"
 
+#ifdef PADDLE_WITH_XBYAK
+#include "xbyak/xbyak.h"
+#include "xbyak/xbyak_util.h"
+#endif
+
 #ifdef __APPLE__
 #include <sys/sysctl.h>
 #include <sys/types.h>
@@ -98,5 +103,39 @@ size_t CUDAPinnedMaxChunkSize() {
   return CUDAPinnedMaxAllocSize() / 256;
 }
 
+#ifdef PADDLE_WITH_XBYAK
+namespace jit {
+
+static Xbyak::util::Cpu cpu;
+bool MayIUse(const cpu_isa_t cpu_isa) {
+  using namespace Xbyak::util;  // NOLINT
+  switch (cpu_isa) {
+    case sse42:
+      return cpu.has(Cpu::tSSE42);
+    case avx2:
+      return cpu.has(Cpu::tAVX2);
+    case avx512_common:
+      return cpu.has(Cpu::tAVX512F);
+    case avx512_core:
+      return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) &&
+             cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ);
+    case avx512_core_vnni:
+      return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) &&
+             cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ) &&
+             cpu.has(Cpu::tAVX512_VNNI);
+    case avx512_mic:
+      return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512CD) &&
+             cpu.has(Cpu::tAVX512ER) && cpu.has(Cpu::tAVX512PF);
+    case avx512_mic_4ops:
+      return true && MayIUse(avx512_mic) && cpu.has(Cpu::tAVX512_4FMAPS) &&
+             cpu.has(Cpu::tAVX512_4VNNIW);
+    case isa_any:
+      return true;
+  }
+  return false;
+}
+
+}  // namespace jit
+#endif
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index f06c2b67fe..f5f6766759 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -37,5 +37,25 @@ size_t CUDAPinnedMinChunkSize();
 //! Get the maximum chunk size for buddy allocator.
 size_t CUDAPinnedMaxChunkSize();
 
+#ifdef PADDLE_WITH_XBYAK
+namespace jit {
+
+typedef enum {
+  isa_any,
+  sse42,
+  avx2,
+  avx512_common,
+  avx512_core,
+  avx512_core_vnni,
+  avx512_mic,
+  avx512_mic_4ops,
+} cpu_isa_t;  // Instruction set architecture
+
+// May I use some instruction
+inline bool MayIUse(const cpu_isa_t cpu_isa);
+
+}  // namespace jit
+#endif
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h
index ecec4178f2..23457ff5fe 100644
--- a/paddle/fluid/platform/cuda_device_function.h
+++ b/paddle/fluid/platform/cuda_device_function.h
@@ -14,6 +14,10 @@ limitations under the License. */
 
 #pragma once
 #include <cuda.h>
+// NOTE(): support float16 to half in header file.
+#define PADDLE_CUDA_FP16
+#include <cuda_fp16.h>
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace platform {
@@ -36,6 +40,18 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
 #endif
 }
 
+// CUDA 9.0 have native compatible float16 shfl_down
+#if CUDA_VERSION < 9000
+template <>
+__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
+                                                       float16 val, int delta,
+                                                       int width) {
+  half tmp = static_cast<half>(val);
+  __shfl_down(tmp, static_cast<unsigned>(delta), width);
+  return float16(tmp);
+}
+#endif
+
 template <typename T>
 __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
                                              int width = 32) {
@@ -46,6 +62,11 @@ __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
 #endif
 }
 
+template <typename T>
+HOSTDEVICE T Infinity() {
+  return INFINITY;
+}
+
 template <typename T>
 __device__ T reduceSum(T val, int tid, int len) {
   // NOTE(zcd): The warp size should be taken from the
diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu
new file mode 100644
index 0000000000..ca5ca1caeb
--- /dev/null
+++ b/paddle/fluid/platform/cuda_helper_test.cu
@@ -0,0 +1,153 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <random>
+
+#define PADDLE_CUDA_FP16
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/float16.h"
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+using paddle::platform::float16;
+
+template <typename T>
+__global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
+  }
+}
+
+template <typename T>
+struct AddFunctor {
+  T operator()(const T& a, const T& b) { return a + b; }
+};
+
+template <typename T>
+void TestCase(size_t num) {
+  T *in1, *in2, *out;
+  T *d_in1, *d_in2;
+  size_t size = sizeof(T) * num;
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+  in1 = reinterpret_cast<T*>(malloc(size));
+  in2 = reinterpret_cast<T*>(malloc(size));
+  out = reinterpret_cast<T*>(malloc(size));
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num; ++i) {
+    in1[i] = static_cast<T>(dist(engine));
+    in2[i] = static_cast<T>(dist(engine));
+  }
+  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
+  AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
+  cudaDeviceSynchronize();
+  cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  for (size_t i = 0; i < num; ++i) {
+    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // so we use EXPECT_NEAR to check the result.
+    EXPECT_NEAR(static_cast<float>(out[i]),
+                static_cast<float>(AddFunctor<T>()(in1[i], in2[i])), 0.001);
+  }
+  free(in1);
+  free(in2);
+  free(out);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+// cuda primitives
+TEST(CudaAtomic, Add) {
+  TestCase<float>(static_cast<size_t>(10));
+  TestCase<float>(static_cast<size_t>(1024 * 1024));
+
+  TestCase<double>(static_cast<size_t>(10));
+  TestCase<double>(static_cast<size_t>(1024 * 1024));
+}
+
+TEST(CudaAtomic, float16) {
+  TestCase<float16>(static_cast<size_t>(1));
+  TestCase<float16>(static_cast<size_t>(2));
+  TestCase<float16>(static_cast<size_t>(3));
+
+  TestCase<float16>(static_cast<size_t>(10));
+  TestCase<float16>(static_cast<size_t>(1024 * 1024));
+}
+
+// unalignment of uint8
+void TestUnalign(size_t num, const int shift_bit) {
+  PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2");
+  float16 *in1, *in2, *out;
+  float16 *d_in1, *d_in2;
+  size_t size = sizeof(uint8_t) * (num + shift_bit);
+  size_t array_size = sizeof(float16) * (num / 2);
+
+  cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+  in1 = reinterpret_cast<float16*>(malloc(size));
+  in2 = reinterpret_cast<float16*>(malloc(size));
+  out = reinterpret_cast<float16*>(malloc(size));
+
+  // right shift 1, mimic the unalignment of address
+  float16* r_in1 =
+      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in1) + shift_bit);
+  float16* r_in2 =
+      reinterpret_cast<float16*>(reinterpret_cast<uint8_t*>(in2) + shift_bit);
+
+  std::minstd_rand engine;
+  std::uniform_real_distribution<double> dist(0.0, 1.0);
+  for (size_t i = 0; i < num / 2; ++i) {
+    r_in1[i] = static_cast<float16>(dist(engine));
+    r_in2[i] = static_cast<float16>(dist(engine));
+  }
+  cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
+  AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
+  cudaDeviceSynchronize();
+  cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
+  cudaDeviceSynchronize();
+  for (size_t i = 0; i < num / 2; ++i) {
+    // NOTE(dzhwinter): the float16 add has small underflow/overflow
+    // so we use EXPECT_NEAR to check the result.
+    EXPECT_NEAR(static_cast<float>(out[i]),
+                static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
+                0.001);
+  }
+  free(in1);
+  free(in2);
+  free(out);
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+TEST(CudaAtomic, float16Unalign) {
+  // same with float16 testcase
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 2);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 2);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 2);
+
+  // shift the address.
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 1);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 1);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 1);
+
+  TestUnalign(static_cast<size_t>(2), /*shift_bit*/ 3);
+  TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
+  TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
+}
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
index d535ed2f89..67ea64833d 100644
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -14,12 +14,14 @@ limitations under the License. */
 
 #pragma once
 #include <cuda.h>
+#include <stdio.h>
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace platform {
 
 #define CUDA_ATOMIC_WRAPPER(op, T) \
-  __device__ __forceinline__ T CudaAtomic##op(T* address, const T val)
+  __device__ __forceinline__ T CudaAtomic##op(T *address, const T val)
 
 #define USE_CUDA_ATOMIC(op, T) \
   CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
@@ -42,17 +44,17 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) {
   static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
                 "long long should be int64");
   return CudaAtomicAdd(
-      reinterpret_cast<unsigned long long int*>(address),  // NOLINT
-      static_cast<unsigned long long int>(val));           // NOLINT
+      reinterpret_cast<unsigned long long int *>(address),  // NOLINT
+      static_cast<unsigned long long int>(val));            // NOLINT
 }
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
 USE_CUDA_ATOMIC(Add, double);
 #else
 CUDA_ATOMIC_WRAPPER(Add, double) {
-  unsigned long long int* address_as_ull =                 // NOLINT
-      reinterpret_cast<unsigned long long int*>(address);  // NOLINT
-  unsigned long long int old = *address_as_ull, assumed;   // NOLINT
+  unsigned long long int *address_as_ull =                  // NOLINT
+      reinterpret_cast<unsigned long long int *>(address);  // NOLINT
+  unsigned long long int old = *address_as_ull, assumed;    // NOLINT
 
   do {
     assumed = old;
@@ -64,6 +66,67 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 
   return __longlong_as_double(old);
 }
+#endif
+
+#ifdef PADDLE_CUDA_FP16
+// NOTE(dzhwinter): cuda do not have atomicCAS for half.
+// Just use the half address as a unsigned value address and
+// do the atomicCAS. According to the value store at high 16 bits
+// or low 16 bits, then do a different sum and CAS.
+// Given most warp-threads will failed on the atomicCAS, so this
+// implemented should be avoided in high concurrency. It's will be
+// slower than the way convert value into 32bits and do a full atomicCAS.
+
+// convert the value into float and do the add arithmetic.
+// then store the result into a uint32.
+inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) {
+  float16 low_half;
+  // the float16 in lower 16bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half = static_cast<float16>(static_cast<float>(low_half) + x);
+  return (val & 0xFFFF0000u) | low_half.x;
+}
+
+inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
+  float16 high_half;
+  // the float16 in higher 16bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half = static_cast<float16>(static_cast<float>(high_half) + x);
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+}
+
+CUDA_ATOMIC_WRAPPER(Add, float16) {
+  // concrete packed float16 value may exsits in lower or higher 16bits
+  // of the 32bits address.
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
+  float val_f = static_cast<float>(val);
+  uint32_t old = *address_as_ui;
+  uint32_t sum;
+  uint32_t newval;
+  uint32_t assumed;
+  if (((uintptr_t)address & 0x02) == 0) {
+    // the float16 value stay at lower 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f));
+    } while (old != assumed);
+    float16 ret;
+    ret.x = old & 0xFFFFu;
+    return ret;
+  } else {
+    // the float16 value stay at higher 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f));
+    } while (old != assumed);
+    float16 ret;
+    ret.x = old >> 16;
+    return ret;
+  }
+}
+
 #endif
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 6ea4f8b7cb..bb8b14bb9f 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -59,13 +59,12 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
 #define CUDNN_VERSION_MIN(major, minor, patch) \
   (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
 
-#define CUDNN_ENFORCE(condition)                                  \
-  do {                                                            \
-    cudnnStatus_t status = condition;                             \
-    if (status != CUDNN_STATUS_SUCCESS) {                         \
-      VLOG(1) << ::paddle::platform::cudnnGetErrorString(status); \
-      PADDLE_THROW("cuDNN call failed");                          \
-    }                                                             \
+#define CUDNN_ENFORCE(condition)                                     \
+  do {                                                               \
+    cudnnStatus_t status = condition;                                \
+    if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) {                  \
+      PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
+    }                                                                \
   } while (false)
 
 enum class DataLayout {  // Not use
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index d9e2afadaf..dc1d751141 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -30,9 +30,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 namespace {
-// Current thread's id. Note, we don't distinguish nested threads
-// for now.
-thread_local int cur_thread_id = 0;
 // Tracking the nested block stacks of each thread.
 thread_local std::deque<int> block_id_stack;
 // Tracking the nested event stacks.
@@ -192,6 +189,8 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
 }
 }  // namespace
 
+#endif  // PADDLE_WITH_CUPTI
+
 class DeviceTracerImpl : public DeviceTracer {
  public:
   DeviceTracerImpl() : enabled_(false) {}
@@ -247,6 +246,8 @@ class DeviceTracerImpl : public DeviceTracer {
     if (enabled_) {
       return;
     }
+
+#ifdef PADDLE_WITH_CUPTI
     EnableActivity();
 
     // Register callbacks for buffer requests and completed by CUPTI.
@@ -265,6 +266,7 @@ class DeviceTracerImpl : public DeviceTracer {
         dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
                                      CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
     CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
+#endif  // PADDLE_WITH_CUPTI
     enabled_ = true;
   }
 
@@ -316,16 +318,21 @@ class DeviceTracerImpl : public DeviceTracer {
   }
 
   void Disable() {
+#ifdef PADDLE_WITH_CUPTI
     // flush might cause additional calls to DeviceTracker.
     dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
+#endif  // PADDLE_WITH_CUPTI
     std::lock_guard<std::mutex> l(trace_mu_);
+#ifdef PADDLE_WITH_CUPTI
     DisableActivity();
     dynload::cuptiUnsubscribe(subscriber_);
     CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
+#endif  // PADDLE_WITH_CUPTI
     enabled_ = false;
   }
 
  private:
+#ifdef PADDLE_WITH_CUPTI
   static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
                                    CUpti_CallbackId cbid, const void *cbdata) {
     auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
@@ -343,7 +350,8 @@ class DeviceTracerImpl : public DeviceTracer {
       VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
     }
   }
-
+  CUpti_SubscriberHandle subscriber_;
+#endif  // PADDLE_WITH_CUPTI
   std::mutex trace_mu_;
   bool enabled_;
   uint64_t start_ns_;
@@ -352,45 +360,9 @@ class DeviceTracerImpl : public DeviceTracer {
   std::vector<MemRecord> mem_records_;
   std::vector<CPURecord> cpu_records_;
   std::unordered_map<uint32_t, std::string> correlations_;
-  CUpti_SubscriberHandle subscriber_;
-};
-
-#endif  // PADDLE_WITH_CUPTI
-
-class DeviceTracerDummy : public DeviceTracer {
- public:
-  DeviceTracerDummy() {}
-
-  void AddAnnotation(uint64_t id, const std::string &anno) {}
-
-  void AddCPURecords(const std::string &anno, uint64_t start_ns,
-                     uint64_t end_ns, int64_t device_id, int64_t thread_id) {}
-
-  void AddMemRecords(const std::string &name, uint64_t start_ns,
-                     uint64_t end_ns, int64_t device_id, int64_t stream_id,
-                     uint32_t correlation_id, uint64_t bytes) {}
-
-  void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
-                        int64_t stream_id, uint32_t correlation_id) {}
-
-  bool IsEnabled() { return false; }
-
-  void Enable() {}
-
-  proto::Profile GenProfile(const std::string &profile_path) {
-    return proto::Profile();
-  }
-
-  void Disable() {}
 };
 
-void CreateTracer(DeviceTracer **t) {
-#ifdef PADDLE_WITH_CUPTI
-  *t = new DeviceTracerImpl();
-#else
-  *t = new DeviceTracerDummy();
-#endif  // PADDLE_WITH_CUPTI
-}
+void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); }
 
 DeviceTracer *GetDeviceTracer() {
   std::call_once(tracer_once_flag, CreateTracer, &tracer);
@@ -413,12 +385,5 @@ void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
 void ClearCurBlock() { block_id_stack.pop_back(); }
 
 int BlockDepth() { return block_id_stack.size(); }
-
-void SetCurThread(int thread_id) { cur_thread_id = thread_id; }
-
-void ClearCurThread() { cur_thread_id = 0; }
-
-int CurThread() { return cur_thread_id; }
-
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index 0375c7439c..322996fb4f 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#include <sys/time.h>
+#include <time.h>
+#include <chrono>  // NOLINT
 #include <string>
 
 #include "paddle/fluid/platform/dynload/cupti.h"
@@ -25,6 +28,12 @@ namespace platform {
 // WARN: Under Development. Don't depend on it yet.
 //////////////////////
 
+inline uint64_t PosixInNsec() {
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
+}
+
 // DeviceTracer performs the following tasks:
 // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
 // 2. Collect cuda statistics: start/end ts, memory, etc.
@@ -99,9 +108,5 @@ std::string CurAnnotation();
 void SetCurBlock(int block_id);
 void ClearCurBlock();
 int BlockDepth();
-
-void SetCurThread(int thread_id);
-void ClearCurThread();
-int CurThread();
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index ffd183af68..efb021c838 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -67,8 +67,11 @@ struct float16;
 }  // namespace platform
 }  // namespace paddle
 
+// NOTE():
+// Do not move the eigen.h header, otherwise the eigen_vector<bool> will failed.
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/hostdevice.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
 namespace platform {
@@ -898,6 +901,30 @@ struct is_pod<paddle::platform::float16> {
       is_standard_layout<paddle::platform::float16>::value;
 };
 
+template <>
+struct is_floating_point<paddle::platform::float16>
+    : std::integral_constant<
+          bool, std::is_same<paddle::platform::float16,
+                             typename std::remove_cv<
+                                 paddle::platform::float16>::type>::value> {};
+template <>
+struct is_signed<paddle::platform::float16> {
+  static const bool value = true;
+};
+
+template <>
+struct is_unsigned<paddle::platform::float16> {
+  static const bool value = false;
+};
+
+inline bool isnan(const paddle::platform::float16& a) {
+  return paddle::platform::isnan(a);
+}
+
+inline bool isinf(const paddle::platform::float16& a) {
+  return paddle::platform::isinf(a);
+}
+
 template <>
 struct numeric_limits<paddle::platform::float16> {
   static const bool is_specialized = true;
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index ede294be1e..27e930e6e0 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -141,10 +141,36 @@ TEST(float16, lod_tensor_cpu) {
   }
 }
 
+TEST(float16, floating) {
+  // compile time assert.
+  PADDLE_ASSERT(std::is_floating_point<float16>::value);
+}
+
 TEST(float16, print) {
   float16 a = float16(1.0f);
   std::cout << a << std::endl;
 }
 
+// CPU test
+TEST(float16, isinf) {
+  float16 a;
+  a.x = 0x7c00;
+  float16 b = float16(INFINITY);
+  float16 c = static_cast<float16>(INFINITY);
+  EXPECT_EQ(std::isinf(a), true);
+  EXPECT_EQ(std::isinf(b), true);
+  EXPECT_EQ(std::isinf(c), true);
+}
+
+TEST(float16, isnan) {
+  float16 a;
+  a.x = 0x7fff;
+  float16 b = float16(NAN);
+  float16 c = static_cast<float16>(NAN);
+  EXPECT_EQ(std::isnan(a), true);
+  EXPECT_EQ(std::isnan(b), true);
+  EXPECT_EQ(std::isnan(c), true);
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 1b9cf9b5d3..e2b7ca9b03 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -11,11 +11,13 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/float16.h"
 
+#include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <bitset>
+#include <iostream>
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/legacy/utils/Logging.h"
 
 #define ARITHMETIC_KERNEL(op_type, sign)                                 \
   __global__ void op_type(const half* in1, const half* in2, half* out) { \
@@ -241,6 +243,72 @@ TEST(float16, lod_tensor_on_gpu) {
   }
 }
 
+template <typename T>
+struct Functor {
+  bool operator()(const T& val) {
+    return std::type_index(typeid(T)) ==
+           std::type_index(typeid(platform::float16));
+  }
+};
+
+TEST(float16, typeid) {
+  // the framework heavily used typeid hash
+  Functor<float16> functor;
+  float16 a = float16(.0f);
+  Functor<int> functor2;
+  int b(0);
+
+  // compile time assert
+  PADDLE_ASSERT(functor(a) == true);
+  PADDLE_ASSERT(functor2(b) == false);
+}
+
+// GPU test
+TEST(float16, isinf) {
+  float16 a;
+  a.x = 0x7c00;
+  float16 b = float16(INFINITY);
+  // underflow to 0
+  float16 native_a(5e-40f);
+  // overflow to inf
+  float16 native_b(5e40f);
+  EXPECT_EQ(std::isinf(a), true);
+  EXPECT_EQ(std::isinf(b), true);
+  EXPECT_EQ(std::isinf(native_b), true);
+  EXPECT_EQ(native_a, float16(0));
+}
+
+TEST(float16, isnan) {
+  float16 a;
+  a.x = 0x7fff;
+  float16 b = float16(NAN);
+  float16 c = float16(5e40);
+  // inf * +-0 will get a nan
+  float16 d = c * float16(0);
+  EXPECT_EQ(std::isnan(a), true);
+  EXPECT_EQ(std::isnan(b), true);
+  EXPECT_EQ(std::isnan(d), true);
+}
+
+TEST(float16, cast) {
+  float16 a;
+  a.x = 0x0070;
+  auto b = a;
+  {
+    // change semantic, keep the same value
+    float16 c = reinterpret_cast<float16&>(reinterpret_cast<unsigned&>(b));
+    EXPECT_EQ(b, c);
+  }
+
+  {
+    // use uint32 low 16 bit store float16
+    uint32_t c = reinterpret_cast<uint32_t&>(b);
+    float16 d;
+    d.x = c;
+    EXPECT_EQ(b, d);
+  }
+}
+
 }  // namespace platform
 }  // namespace paddle
 #endif  // PADDLE_CUDA_FP16
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 0b77652841..6f1f0c4796 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -23,6 +23,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
 
+DEFINE_int32(paddle_num_threads, 1,
+             "Number of threads for each paddle instance.");
+
 namespace paddle {
 namespace framework {
 
@@ -115,7 +118,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
   places.emplace_back(platform::CPUPlace());
   platform::DeviceContextPool::Init(places);
 #ifndef PADDLE_WITH_MKLDNN
-  platform::SetNumThreads(1);
+  platform::SetNumThreads(FLAGS_paddle_num_threads);
 #endif
 }
 
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index a8f93e6848..10a3ad256b 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -223,7 +223,7 @@ class MKLDNNHandler {
   static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
                              const std::string& suffix) {
     return dims2str(operand_dims) + suffix;
-  };
+  }
 
  protected:
   static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
@@ -251,5 +251,17 @@ inline mkldnn::memory::format MKLDNNFormatForSize(
   return data_format;
 }
 
+inline mkldnn::memory::format data_format_to_memory_format(
+    const std::string& data_format) {
+  switch (framework::StringToDataLayout(data_format)) {
+    case framework::DataLayout::kNHWC:
+      return mkldnn::memory::format::nhwc;
+    case framework::DataLayout::kNCHW:
+      return mkldnn::memory::format::nchw;
+    default:
+      return mkldnn::memory::format::any;
+  }
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 01de9d7041..d0286719b9 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 
 #include <sys/time.h>
-#include <time.h>
 #include <algorithm>
 #include <iomanip>
 #include <limits>
@@ -97,12 +96,6 @@ inline uint64_t GetTimeInNsec() {
       .count();
 }
 
-inline uint64_t PosixInNsec() {
-  struct timeval tv;
-  gettimeofday(&tv, nullptr);
-  return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
-}
-
 Event::Event(EventType type, std::string name, uint32_t thread_id,
              const DeviceContext* dev_ctx)
     : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
@@ -110,6 +103,8 @@ Event::Event(EventType type, std::string name, uint32_t thread_id,
   has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
   if (has_cuda_) {
     auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
+    PADDLE_ENFORCE(cudaSetDevice(
+        boost::get<platform::CUDAPlace>(cuda_dev_ctx->GetPlace()).device));
     PADDLE_ENFORCE(cudaGetDevice(&device_));
     PADDLE_ENFORCE(cudaEventCreate(&event_));
     auto stream = cuda_dev_ctx->stream();
@@ -176,6 +171,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
 
 RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
     : is_enabled_(false), start_ns_(PosixInNsec()) {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled) return;
   is_enabled_ = true;
   dev_ctx_ = dev_ctx;
@@ -186,11 +182,12 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
 }
 
 RecordEvent::~RecordEvent() {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
     tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
-                          BlockDepth(), CurThread());
+                          BlockDepth(), g_thread_id);
   }
   ClearCurAnnotation();
   PopEvent(name_, dev_ctx_);
@@ -198,6 +195,7 @@ RecordEvent::~RecordEvent() {
 
 RecordBlock::RecordBlock(int block_id)
     : is_enabled_(false), start_ns_(PosixInNsec()) {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled) return;
   is_enabled_ = true;
   SetCurBlock(block_id);
@@ -205,27 +203,18 @@ RecordBlock::RecordBlock(int block_id)
 }
 
 RecordBlock::~RecordBlock() {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
     // We try to put all blocks at the same nested depth in the
     // same timeline lane. and distinguish the using thread_id.
     tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
-                          CurThread());
+                          g_thread_id);
   }
   ClearCurBlock();
 }
 
-RecordThread::RecordThread(int thread_id) {
-  if (g_state == ProfilerState::kDisabled) return;
-  SetCurThread(thread_id);
-}
-
-RecordThread::~RecordThread() {
-  if (g_state == ProfilerState::kDisabled) return;
-  ClearCurThread();
-}
-
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index bf43925373..c99d9c807d 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -95,11 +95,6 @@ struct RecordBlock {
   uint64_t start_ns_;
 };
 
-struct RecordThread {
-  explicit RecordThread(int thread_id);
-  ~RecordThread();
-};
-
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
 std::vector<std::vector<Event>> GetAllEvents();
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index ee1c8d46dd..2320f3e4db 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -394,8 +394,10 @@ All parameter, weight, gradient are variables in Paddle.
     InferenceOptimize(*(origin.Proto()), &pruned_desc);
     return new ProgramDesc(pruned_desc);
   });
-  m.def("empty_var_name", []() { return framework::kEmptyVarName; });
-  m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; });
+  m.def("empty_var_name",
+        []() { return std::string(framework::kEmptyVarName); });
+  m.def("grad_var_suffix",
+        []() { return std::string(framework::kGradVarSuffix); });
   m.def_submodule(
        "var_names",
        "The module will return special predefined variable name in Paddle")
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 9e58a39eb0..8460f93b84 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -419,6 +419,25 @@ EOF
     linkchecker doc/v2/en/html/index.html
     linkchecker doc/v2/cn/html/index.html
     linkchecker doc/v2/api/en/html/index.html
+
+    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
+
+    # Deploy to the the content server if its a "develop" or "release/version" branch
+    # The "develop_doc" branch is reserved to test full deploy process without impacting the real content.
+    if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then
+        PPO_SCRIPT_BRANCH=develop
+    elif [[ "$TRAVIS_BRANCH" == "develop"  ||  "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then
+        PPO_SCRIPT_BRANCH=master
+    else
+        # Early exit, this branch doesn't require documentation build
+        return 0;
+    fi
+     # Fetch the paddlepaddle.org deploy_docs.sh from the appopriate branch
+    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh
+    export PYTHONPATH=$PYTHONPATH:${PADDLE_ROOT}/build/python:/paddle/build/python
+    cd ..
+    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH ${PADDLE_ROOT} ${PADDLE_ROOT}/build/doc/ ${PPO_SCRIPT_BRANCH}
+    cd -
 }
 
 function gen_html() {
@@ -534,7 +553,7 @@ EOF
         make -j `nproc` inference_lib_dist
         cd ${PADDLE_ROOT}/build
         cp -r fluid_install_dir fluid
-        tar -cf fluid.tgz fluid
+        tar -czf fluid.tgz fluid
       fi
 }
 
@@ -547,6 +566,7 @@ function test_fluid_inference_lib() {
 EOF
         cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
         ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF}
+        ./clean.sh
       fi
 }
 
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
index 3462deb9c2..174c2a12f0 100755
--- a/paddle/scripts/paddle_docker_build.sh
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -52,6 +52,9 @@ EOL
     ${DOCKER_CMD} run -it \
         ${DOCKER_ENV} \
         -e SCRIPT_NAME=$0 \
+        -e CONTENT_DEC_PASSWD=$CONTENT_DEC_PASSWD \
+        -e TRAVIS_BRANCH=$TRAVIS_BRANCH \
+        -e TRAVIS_PULL_REQUEST=$TRAVIS_PULL_REQUEST \
         -v $PADDLE_ROOT:/paddle \
         -v ${HOME}/.ccache:/root/.ccache \
         -w /paddle \
diff --git a/patches/grpc/completion_queue.h b/patches/grpc/completion_queue.h
new file mode 100644
index 0000000000..6e92c60ea2
--- /dev/null
+++ b/patches/grpc/completion_queue.h
@@ -0,0 +1,386 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// A completion queue implements a concurrent producer-consumer queue, with
+/// two main API-exposed methods: \a Next and \a AsyncNext. These
+/// methods are the essential component of the gRPC C++ asynchronous API.
+/// There is also a \a Shutdown method to indicate that a given completion queue
+/// will no longer have regular events. This must be called before the
+/// completion queue is destroyed.
+/// All completion queue APIs are thread-safe and may be used concurrently with
+/// any other completion queue API invocation; it is acceptable to have
+/// multiple threads calling \a Next or \a AsyncNext on the same or different
+/// completion queues, or to call these methods concurrently with a \a Shutdown
+/// elsewhere.
+/// \remark{All other API calls on completion queue should be completed before
+/// a completion queue destructor is called.}
+#ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
+#define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
+
+#include <typeinfo>
+
+#include <grpc/impl/codegen/atm.h>
+#include <grpcpp/impl/codegen/completion_queue_tag.h>
+#include <grpcpp/impl/codegen/core_codegen_interface.h>
+#include <grpcpp/impl/codegen/grpc_library.h>
+#include <grpcpp/impl/codegen/status.h>
+#include <grpcpp/impl/codegen/time.h>
+
+struct grpc_completion_queue;
+
+namespace grpc {
+
+template <class R>
+class ClientReader;
+template <class W>
+class ClientWriter;
+template <class W, class R>
+class ClientReaderWriter;
+template <class R>
+class ServerReader;
+template <class W>
+class ServerWriter;
+namespace internal {
+template <class W, class R>
+class ServerReaderWriterBody;
+}  // namespace internal
+
+class Channel;
+class ChannelInterface;
+class ClientContext;
+class CompletionQueue;
+class Server;
+class ServerBuilder;
+class ServerContext;
+class ServerInterface;
+
+namespace internal {
+class CompletionQueueTag;
+class RpcMethod;
+template <class ServiceType, class RequestType, class ResponseType>
+class RpcMethodHandler;
+template <class ServiceType, class RequestType, class ResponseType>
+class ClientStreamingHandler;
+template <class ServiceType, class RequestType, class ResponseType>
+class ServerStreamingHandler;
+template <class ServiceType, class RequestType, class ResponseType>
+class BidiStreamingHandler;
+class UnknownMethodHandler;
+template <class Streamer, bool WriteNeeded>
+class TemplatedBidiStreamingHandler;
+template <class InputMessage, class OutputMessage>
+class BlockingUnaryCallImpl;
+}  // namespace internal
+
+extern CoreCodegenInterface* g_core_codegen_interface;
+
+/// A thin wrapper around \ref grpc_completion_queue (see \ref
+/// src/core/lib/surface/completion_queue.h).
+/// See \ref doc/cpp/perf_notes.md for notes on best practices for high
+/// performance servers.
+class CompletionQueue : private GrpcLibraryCodegen {
+ public:
+  /// Default constructor. Implicitly creates a \a grpc_completion_queue
+  /// instance.
+  CompletionQueue()
+      : CompletionQueue(grpc_completion_queue_attributes{
+            GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, GRPC_CQ_DEFAULT_POLLING}) {}
+
+  /// Wrap \a take, taking ownership of the instance.
+  ///
+  /// \param take The completion queue instance to wrap. Ownership is taken.
+  explicit CompletionQueue(grpc_completion_queue* take);
+
+  /// Destructor. Destroys the owned wrapped completion queue / instance.
+  ~CompletionQueue() {
+    if (typeid(*g_core_codegen_interface).hash_code() !=
+        typeid(CoreCodegenInterface).hash_code()) {
+      g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
+    }
+  }
+
+  /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT.
+  enum NextStatus {
+    SHUTDOWN,   ///< The completion queue has been shutdown and fully-drained
+    GOT_EVENT,  ///< Got a new event; \a tag will be filled in with its
+                ///< associated value; \a ok indicating its success.
+    TIMEOUT     ///< deadline was reached.
+  };
+
+  /// Read from the queue, blocking until an event is available or the queue is
+  /// shutting down.
+  ///
+  /// \param tag[out] Updated to point to the read event's tag.
+  /// \param ok[out] true if read a successful event, false otherwise.
+  ///
+  /// Note that each tag sent to the completion queue (through RPC operations
+  /// or alarms) will be delivered out of the completion queue by a call to
+  /// Next (or a related method), regardless of whether the operation succeeded
+  /// or not. Success here means that this operation completed in the normal
+  /// valid manner.
+  ///
+  /// Server-side RPC request: \a ok indicates that the RPC has indeed
+  /// been started. If it is false, the server has been Shutdown
+  /// before this particular call got matched to an incoming RPC.
+  ///
+  /// Client-side StartCall/RPC invocation: \a ok indicates that the RPC is
+  /// going to go to the wire. If it is false, it not going to the wire. This
+  /// would happen if the channel is either permanently broken or
+  /// transiently broken but with the fail-fast option. (Note that async unary
+  /// RPCs don't post a CQ tag at this point, nor do client-streaming
+  /// or bidi-streaming RPCs that have the initial metadata corked option set.)
+  ///
+  /// Client-side Write, Client-side WritesDone, Server-side Write,
+  /// Server-side Finish, Server-side SendInitialMetadata (which is
+  /// typically included in Write or Finish when not done explicitly):
+  /// \a ok means that the data/metadata/status/etc is going to go to the
+  /// wire. If it is false, it not going to the wire because the call
+  /// is already dead (i.e., canceled, deadline expired, other side
+  /// dropped the channel, etc).
+  ///
+  /// Client-side Read, Server-side Read, Client-side
+  /// RecvInitialMetadata (which is typically included in Read if not
+  /// done explicitly): \a ok indicates whether there is a valid message
+  /// that got read. If not, you know that there are certainly no more
+  /// messages that can ever be read from this stream. For the client-side
+  /// operations, this only happens because the call is dead. For the
+  /// server-sider operation, though, this could happen because the client
+  /// has done a WritesDone already.
+  ///
+  /// Client-side Finish: \a ok should always be true
+  ///
+  /// Server-side AsyncNotifyWhenDone: \a ok should always be true
+  ///
+  /// Alarm: \a ok is true if it expired, false if it was canceled
+  ///
+  /// \return true if got an event, false if the queue is fully drained and
+  ///         shut down.
+  bool Next(void** tag, bool* ok) {
+    return (AsyncNextInternal(tag,
+                              ok,
+                              g_core_codegen_interface->gpr_inf_future(
+                                  GPR_CLOCK_REALTIME)) != SHUTDOWN);
+  }
+
+  /// Read from the queue, blocking up to \a deadline (or the queue's shutdown).
+  /// Both \a tag and \a ok are updated upon success (if an event is available
+  /// within the \a deadline).  A \a tag points to an arbitrary location usually
+  /// employed to uniquely identify an event.
+  ///
+  /// \param tag[out] Upon sucess, updated to point to the event's tag.
+  /// \param ok[out] Upon sucess, true if a successful event, false otherwise
+  ///        See documentation for CompletionQueue::Next for explanation of ok
+  /// \param deadline[in] How long to block in wait for an event.
+  ///
+  /// \return The type of event read.
+  template <typename T>
+  NextStatus AsyncNext(void** tag, bool* ok, const T& deadline) {
+    TimePoint<T> deadline_tp(deadline);
+    return AsyncNextInternal(tag, ok, deadline_tp.raw_time());
+  }
+
+  /// EXPERIMENTAL
+  /// First executes \a F, then reads from the queue, blocking up to
+  /// \a deadline (or the queue's shutdown).
+  /// Both \a tag and \a ok are updated upon success (if an event is available
+  /// within the \a deadline).  A \a tag points to an arbitrary location usually
+  /// employed to uniquely identify an event.
+  ///
+  /// \param F[in] Function to execute before calling AsyncNext on this queue.
+  /// \param tag[out] Upon sucess, updated to point to the event's tag.
+  /// \param ok[out] Upon sucess, true if read a regular event, false otherwise.
+  /// \param deadline[in] How long to block in wait for an event.
+  ///
+  /// \return The type of event read.
+  template <typename T, typename F>
+  NextStatus DoThenAsyncNext(F&& f, void** tag, bool* ok, const T& deadline) {
+    CompletionQueueTLSCache cache = CompletionQueueTLSCache(this);
+    f();
+    if (cache.Flush(tag, ok)) {
+      return GOT_EVENT;
+    } else {
+      return AsyncNext(tag, ok, deadline);
+    }
+  }
+
+  /// Request the shutdown of the queue.
+  ///
+  /// \warning This method must be called at some point if this completion queue
+  /// is accessed with Next or AsyncNext. \a Next will not return false
+  /// until this method has been called and all pending tags have been drained.
+  /// (Likewise for \a AsyncNext returning \a NextStatus::SHUTDOWN .)
+  /// Only once either one of these methods does that (that is, once the queue
+  /// has been \em drained) can an instance of this class be destroyed.
+  /// Also note that applications must ensure that no work is enqueued on this
+  /// completion queue after this method is called.
+  void Shutdown();
+
+  /// Returns a \em raw pointer to the underlying \a grpc_completion_queue
+  /// instance.
+  ///
+  /// \warning Remember that the returned instance is owned. No transfer of
+  /// owership is performed.
+  grpc_completion_queue* cq() { return cq_; }
+
+ protected:
+  /// Private constructor of CompletionQueue only visible to friend classes
+  CompletionQueue(const grpc_completion_queue_attributes& attributes) {
+    cq_ = g_core_codegen_interface->grpc_completion_queue_create(
+        g_core_codegen_interface->grpc_completion_queue_factory_lookup(
+            &attributes),
+        &attributes,
+        NULL);
+    InitialAvalanching();  // reserve this for the future shutdown
+  }
+
+ private:
+  // Friend synchronous wrappers so that they can access Pluck(), which is
+  // a semi-private API geared towards the synchronous implementation.
+  template <class R>
+  friend class ::grpc::ClientReader;
+  template <class W>
+  friend class ::grpc::ClientWriter;
+  template <class W, class R>
+  friend class ::grpc::ClientReaderWriter;
+  template <class R>
+  friend class ::grpc::ServerReader;
+  template <class W>
+  friend class ::grpc::ServerWriter;
+  template <class W, class R>
+  friend class ::grpc::internal::ServerReaderWriterBody;
+  template <class ServiceType, class RequestType, class ResponseType>
+  friend class ::grpc::internal::RpcMethodHandler;
+  template <class ServiceType, class RequestType, class ResponseType>
+  friend class ::grpc::internal::ClientStreamingHandler;
+  template <class ServiceType, class RequestType, class ResponseType>
+  friend class ::grpc::internal::ServerStreamingHandler;
+  template <class Streamer, bool WriteNeeded>
+  friend class ::grpc::internal::TemplatedBidiStreamingHandler;
+  friend class ::grpc::internal::UnknownMethodHandler;
+  friend class ::grpc::Server;
+  friend class ::grpc::ServerContext;
+  friend class ::grpc::ServerInterface;
+  template <class InputMessage, class OutputMessage>
+  friend class ::grpc::internal::BlockingUnaryCallImpl;
+
+  /// EXPERIMENTAL
+  /// Creates a Thread Local cache to store the first event
+  /// On this completion queue queued from this thread.  Once
+  /// initialized, it must be flushed on the same thread.
+  class CompletionQueueTLSCache {
+   public:
+    CompletionQueueTLSCache(CompletionQueue* cq);
+    ~CompletionQueueTLSCache();
+    bool Flush(void** tag, bool* ok);
+
+   private:
+    CompletionQueue* cq_;
+    bool flushed_;
+  };
+
+  NextStatus AsyncNextInternal(void** tag, bool* ok, gpr_timespec deadline);
+
+  /// Wraps \a grpc_completion_queue_pluck.
+  /// \warning Must not be mixed with calls to \a Next.
+  bool Pluck(internal::CompletionQueueTag* tag) {
+    auto deadline =
+        g_core_codegen_interface->gpr_inf_future(GPR_CLOCK_REALTIME);
+    auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
+        cq_, tag, deadline, nullptr);
+    bool ok = ev.success != 0;
+    void* ignored = tag;
+    GPR_CODEGEN_ASSERT(tag->FinalizeResult(&ignored, &ok));
+    GPR_CODEGEN_ASSERT(ignored == tag);
+    // Ignore mutations by FinalizeResult: Pluck returns the C API status
+    return ev.success != 0;
+  }
+
+  /// Performs a single polling pluck on \a tag.
+  /// \warning Must not be mixed with calls to \a Next.
+  ///
+  /// TODO: sreek - This calls tag->FinalizeResult() even if the cq_ is already
+  /// shutdown. This is most likely a bug and if it is a bug, then change this
+  /// implementation to simple call the other TryPluck function with a zero
+  /// timeout. i.e:
+  ///      TryPluck(tag, gpr_time_0(GPR_CLOCK_REALTIME))
+  void TryPluck(internal::CompletionQueueTag* tag) {
+    auto deadline = g_core_codegen_interface->gpr_time_0(GPR_CLOCK_REALTIME);
+    auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
+        cq_, tag, deadline, nullptr);
+    if (ev.type == GRPC_QUEUE_TIMEOUT) return;
+    bool ok = ev.success != 0;
+    void* ignored = tag;
+    // the tag must be swallowed if using TryPluck
+    GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok));
+  }
+
+  /// Performs a single polling pluck on \a tag. Calls tag->FinalizeResult if
+  /// the pluck() was successful and returned the tag.
+  ///
+  /// This exects tag->FinalizeResult (if called) to return 'false' i.e expects
+  /// that the tag is internal not something that is returned to the user.
+  void TryPluck(internal::CompletionQueueTag* tag, gpr_timespec deadline) {
+    auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
+        cq_, tag, deadline, nullptr);
+    if (ev.type == GRPC_QUEUE_TIMEOUT || ev.type == GRPC_QUEUE_SHUTDOWN) {
+      return;
+    }
+
+    bool ok = ev.success != 0;
+    void* ignored = tag;
+    GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok));
+  }
+
+  /// Manage state of avalanching operations : completion queue tags that
+  /// trigger other completion queue operations. The underlying core completion
+  /// queue should not really shutdown until all avalanching operations have
+  /// been finalized. Note that we maintain the requirement that an avalanche
+  /// registration must take place before CQ shutdown (which must be maintained
+  /// elsehwere)
+  void InitialAvalanching() {
+    gpr_atm_rel_store(&avalanches_in_flight_, static_cast<gpr_atm>(1));
+  }
+  void RegisterAvalanching() {
+    gpr_atm_no_barrier_fetch_add(&avalanches_in_flight_,
+                                 static_cast<gpr_atm>(1));
+  }
+  void CompleteAvalanching();
+
+  grpc_completion_queue* cq_;  // owned
+
+  gpr_atm avalanches_in_flight_;
+};
+
+/// A specific type of completion queue used by the processing of notifications
+/// by servers. Instantiated by \a ServerBuilder.
+class ServerCompletionQueue : public CompletionQueue {
+ public:
+  bool IsFrequentlyPolled() { return polling_type_ != GRPC_CQ_NON_LISTENING; }
+
+ private:
+  grpc_cq_polling_type polling_type_;
+  friend class ServerBuilder;
+  /// \param is_frequently_polled Informs the GRPC library about whether the
+  /// server completion queue would be actively polled (by calling Next() or
+  /// AsyncNext()). By default all server completion queues are assumed to be
+  /// frequently polled.
+  ServerCompletionQueue(grpc_cq_polling_type polling_type)
+      : CompletionQueue(grpc_completion_queue_attributes{
+            GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, polling_type}),
+        polling_type_(polling_type) {}
+};
+
+}  // namespace grpc
+
+#endif  // GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
diff --git a/patches/grpc/fix_too_early_destory.patch b/patches/grpc/fix_too_early_destory.patch
deleted file mode 100644
index d7790d56b0..0000000000
--- a/patches/grpc/fix_too_early_destory.patch
+++ /dev/null
@@ -1,47 +0,0 @@
-diff --git a/include/grpcpp/impl/codegen/completion_queue.h b/include/grpcpp/impl/codegen/completion_queue.h
-index 80c7c41982..3f7d8a7714 100644
---- a/include/grpcpp/impl/codegen/completion_queue.h
-+++ b/include/grpcpp/impl/codegen/completion_queue.h
-@@ -32,6 +32,8 @@
- #ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
- #define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
- 
-+#include <typeinfo>
-+
- #include <grpc/impl/codegen/atm.h>
- #include <grpcpp/impl/codegen/completion_queue_tag.h>
- #include <grpcpp/impl/codegen/core_codegen_interface.h>
-@@ -106,7 +108,9 @@ class CompletionQueue : private GrpcLibraryCodegen {
- 
-   /// Destructor. Destroys the owned wrapped completion queue / instance.
-   ~CompletionQueue() {
--    g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
-+	if (typeid(*g_core_codegen_interface).hash_code() != typeid(CoreCodegenInterface).hash_code()) {
-+      g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
-+	}
-   }
- 
-   /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT.
-diff --git a/include/grpcpp/impl/codegen/grpc_library.h b/include/grpcpp/impl/codegen/grpc_library.h
-index 17c904d71a..a092b2204d 100644
---- a/include/grpcpp/impl/codegen/grpc_library.h
-+++ b/include/grpcpp/impl/codegen/grpc_library.h
-@@ -19,6 +19,8 @@
- #ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
- #define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
- 
-+#include <typeinfo>
-+
- #include <grpcpp/impl/codegen/core_codegen_interface.h>
- 
- namespace grpc {
-@@ -47,7 +49,8 @@ class GrpcLibraryCodegen {
-     }
-   }
-   virtual ~GrpcLibraryCodegen() {
--    if (grpc_init_called_) {
-+    if (grpc_init_called_ &&
-+		typeid(*g_glip).hash_code() != typeid(GrpcLibraryInterface).hash_code()) {
-       GPR_CODEGEN_ASSERT(g_glip &&
-                          "gRPC library not initialized. See "
-                          "grpc::internal::GrpcLibraryInitializer.");
diff --git a/patches/grpc/grpc_library.h b/patches/grpc/grpc_library.h
new file mode 100644
index 0000000000..4870a1cda4
--- /dev/null
+++ b/patches/grpc/grpc_library.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
+#define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
+
+#include <typeinfo>
+
+#include <grpcpp/impl/codegen/core_codegen_interface.h>
+
+namespace grpc {
+
+class GrpcLibraryInterface {
+ public:
+  virtual ~GrpcLibraryInterface() = default;
+  virtual void init() = 0;
+  virtual void shutdown() = 0;
+};
+
+/// Initialized by \a grpc::GrpcLibraryInitializer from
+/// <grpcpp/impl/grpc_library.h>
+extern GrpcLibraryInterface* g_glip;
+
+/// Classes that require gRPC to be initialized should inherit from this class.
+class GrpcLibraryCodegen {
+ public:
+  GrpcLibraryCodegen(bool call_grpc_init = true) : grpc_init_called_(false) {
+    if (call_grpc_init) {
+      GPR_CODEGEN_ASSERT(g_glip &&
+                         "gRPC library not initialized. See "
+                         "grpc::internal::GrpcLibraryInitializer.");
+      g_glip->init();
+      grpc_init_called_ = true;
+    }
+  }
+  virtual ~GrpcLibraryCodegen() {
+    if (grpc_init_called_ &&
+        typeid(*g_glip).hash_code() !=
+            typeid(GrpcLibraryInterface).hash_code()) {
+      GPR_CODEGEN_ASSERT(g_glip &&
+                         "gRPC library not initialized. See "
+                         "grpc::internal::GrpcLibraryInitializer.");
+      g_glip->shutdown();
+    }
+  }
+
+ private:
+  bool grpc_init_called_;
+};
+
+}  // namespace grpc
+
+#endif  // GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
diff --git a/python/paddle/batch.py b/python/paddle/batch.py
index d48c54fcbb..0085096607 100644
--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=False):
         if drop_last == False and len(b) != 0:
             yield b
 
+    # Batch size check
+    batch_size = int(batch_size)
+    if batch_size <= 0:
+        raise ValueError("batch_size should be a positive integeral value, "
+                         "but got batch_size={}".format(batch_size))
+
     return batch_reader
diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py
index 79ddd8b7e6..f6b4ff8fbd 100644
--- a/python/paddle/dataset/cifar.py
+++ b/python/paddle/dataset/cifar.py
@@ -28,11 +28,12 @@ images per class.
 
 """
 
-import cPickle
 import itertools
 import numpy
 import paddle.dataset.common
 import tarfile
+from six.moves import zip
+from six.moves import cPickle as pickle
 
 __all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
 
@@ -48,7 +49,7 @@ def reader_creator(filename, sub_name, cycle=False):
         data = batch['data']
         labels = batch.get('labels', batch.get('fine_labels', None))
         assert labels is not None
-        for sample, label in itertools.izip(data, labels):
+        for sample, label in zip(data, labels):
             yield (sample / 255.0).astype(numpy.float32), int(label)
 
     def reader():
@@ -58,7 +59,7 @@ def reader_creator(filename, sub_name, cycle=False):
 
             while True:
                 for name in names:
-                    batch = cPickle.load(f.extractfile(name))
+                    batch = pickle.load(f.extractfile(name))
                     for item in read_batch(batch):
                         yield item
                 if not cycle:
diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py
index 68660601c1..6195cc50df 100644
--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -20,9 +20,8 @@ import shutil
 import sys
 import importlib
 import paddle.dataset
-import cPickle
+import six.moves.cPickle as pickle
 import glob
-import cPickle as pickle
 
 __all__ = [
     'DATA_HOME',
@@ -75,13 +74,13 @@ def download(url, module_name, md5sum, save_name=None):
     retry_limit = 3
     while not (os.path.exists(filename) and md5file(filename) == md5sum):
         if os.path.exists(filename):
-            print "file md5", md5file(filename), md5sum
+            print("file md5", md5file(filename), md5sum)
         if retry < retry_limit:
             retry += 1
         else:
             raise RuntimeError("Cannot download {0} within retry limit {1}".
                                format(url, retry_limit))
-        print "Cache file %s not found, downloading %s" % (filename, url)
+        print("Cache file %s not found, downloading %s" % (filename, url))
         r = requests.get(url, stream=True)
         total_length = r.headers.get('content-length')
 
@@ -104,8 +103,9 @@ def download(url, module_name, md5sum, save_name=None):
 
 
 def fetch_all():
-    for module_name in filter(lambda x: not x.startswith("__"),
-                              dir(paddle.dataset)):
+    for module_name in [
+            x for x in dir(paddle.dataset) if not x.startswith("__")
+    ]:
         if "fetch" in dir(
                 importlib.import_module("paddle.dataset.%s" % module_name)):
             getattr(
@@ -114,8 +114,9 @@ def fetch_all():
 
 
 def fetch_all_recordio(path):
-    for module_name in filter(lambda x: not x.startswith("__"),
-                              dir(paddle.dataset)):
+    for module_name in [
+            x for x in dir(paddle.dataset) if not x.startswith("__")
+    ]:
         if "convert" in dir(
                 importlib.import_module("paddle.dataset.%s" % module_name)) and \
                 not module_name == "common":
@@ -126,7 +127,7 @@ def fetch_all_recordio(path):
                 "convert")(ds_path)
 
 
-def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
+def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump):
     """
     you can call the function as:
 
@@ -167,7 +168,7 @@ def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
 def cluster_files_reader(files_pattern,
                          trainer_count,
                          trainer_id,
-                         loader=cPickle.load):
+                         loader=pickle.load):
     """
     Create a reader that yield element from the given files, select
     a file set according trainer count and trainer_id
@@ -188,7 +189,7 @@ def cluster_files_reader(files_pattern,
         my_file_list = []
         for idx, fn in enumerate(file_list):
             if idx % trainer_count == trainer_id:
-                print "append file: %s" % fn
+                print("append file: %s" % fn)
                 my_file_list.append(fn)
         for fn in my_file_list:
             with open(fn, "r") as f:
@@ -221,7 +222,7 @@ def convert(output_path, reader, line_count, name_prefix):
         for l in lines:
             # FIXME(Yancey1989):
             # dumps with protocol: pickle.HIGHEST_PROTOCOL
-            writer.write(cPickle.dumps(l))
+            writer.write(pickle.dumps(l))
         writer.close()
 
     lines = []
diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py
index 4e94ce8989..a97c95d067 100644
--- a/python/paddle/dataset/conll05.py
+++ b/python/paddle/dataset/conll05.py
@@ -24,18 +24,19 @@ import tarfile
 import gzip
 import itertools
 import paddle.dataset.common
+from six.moves import zip
 
 __all__ = ['test, get_dict', 'get_embedding', 'convert']
 
 DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
 DATA_MD5 = '387719152ae52d60422c016e92a742fc'
-WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
+WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
 WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
-VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
+VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
 VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
-TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
+TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
 TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
-EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
+EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
 EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
 
 UNK_IDX = 0
@@ -87,12 +88,12 @@ def corpus_reader(data_path, words_name, props_name):
             sentences = []
             labels = []
             one_seg = []
-            for word, label in itertools.izip(words_file, props_file):
+            for word, label in zip(words_file, props_file):
                 word = word.strip()
                 label = label.strip().split()
 
                 if len(label) == 0:  # end of sentence
-                    for i in xrange(len(one_seg[0])):
+                    for i in range(len(one_seg[0])):
                         a_kind_lable = [x[i] for x in one_seg]
                         labels.append(a_kind_lable)
 
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index 2354987d20..914dae348b 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -28,10 +28,9 @@ Graphics and Image Processing (2008)
 http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
 
 """
-import cPickle
 import itertools
 import functools
-from common import download
+from .common import download
 import tarfile
 import scipy.io as scio
 from paddle.dataset.image import *
@@ -39,6 +38,8 @@ from paddle.reader import *
 import os
 import numpy as np
 from multiprocessing import cpu_count
+from six.moves import cPickle as pickle
+from six.moves import zip
 __all__ = ['train', 'test', 'valid']
 
 DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
@@ -116,10 +117,10 @@ def reader_creator(data_file,
                 file = file.strip()
                 batch = None
                 with open(file, 'r') as f:
-                    batch = cPickle.load(f)
+                    batch = pickle.load(f)
                 data = batch['data']
                 labels = batch['label']
-                for sample, label in itertools.izip(data, batch['label']):
+                for sample, label in zip(data, batch['label']):
                     yield sample, int(label) - 1
             if not cycle:
                 break
diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py
index 9235c41e9e..3b3d89c93c 100644
--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -36,7 +36,7 @@ except ImportError:
     cv2 = None
 import os
 import tarfile
-import cPickle
+import six.moves.cPickle as pickle
 
 __all__ = [
     "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
@@ -86,10 +86,10 @@ def batch_images_from_tar(data_file,
                 output = {}
                 output['label'] = labels
                 output['data'] = data
-                cPickle.dump(
+                pickle.dump(
                     output,
                     open('%s/batch_%d' % (out_path, file_id), 'w'),
-                    protocol=cPickle.HIGHEST_PROTOCOL)
+                    protocol=pickle.HIGHEST_PROTOCOL)
                 file_id += 1
                 data = []
                 labels = []
@@ -97,10 +97,10 @@ def batch_images_from_tar(data_file,
         output = {}
         output['label'] = labels
         output['data'] = data
-        cPickle.dump(
+        pickle.dump(
             output,
             open('%s/batch_%d' % (out_path, file_id), 'w'),
-            protocol=cPickle.HIGHEST_PROTOCOL)
+            protocol=pickle.HIGHEST_PROTOCOL)
 
     with open(meta_file, 'a') as meta:
         for file in os.listdir(out_path):
diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py
index 5ff05b1e9b..e7fe4e0b7e 100644
--- a/python/paddle/dataset/imdb.py
+++ b/python/paddle/dataset/imdb.py
@@ -42,13 +42,13 @@ def tokenize(pattern):
         # sequential access of member files, other than
         # tarfile.extractfile, which does random access and might
         # destroy hard disks.
-        tf = tarf.next()
+        tf = next(tarf)
         while tf != None:
             if bool(pattern.match(tf.name)):
                 # newline and punctuations removal and ad-hoc tokenization.
                 yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
                     None, string.punctuation).lower().split()
-            tf = tarf.next()
+            tf = next(tarf)
 
 
 def build_dict(pattern, cutoff):
@@ -62,11 +62,11 @@ def build_dict(pattern, cutoff):
             word_freq[word] += 1
 
     # Not sure if we should prune less-frequent words here.
-    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
+    word_freq = [x for x in list(word_freq.items()) if x[1] > cutoff]
 
     dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
     words, _ = list(zip(*dictionary))
-    word_idx = dict(zip(words, xrange(len(words))))
+    word_idx = dict(list(zip(words, list(range(len(words))))))
     word_idx['<unk>'] = len(words)
     return word_idx
 
diff --git a/python/paddle/dataset/imikolov.py b/python/paddle/dataset/imikolov.py
index c6c0a0f543..bc007c9d3c 100644
--- a/python/paddle/dataset/imikolov.py
+++ b/python/paddle/dataset/imikolov.py
@@ -64,11 +64,11 @@ def build_dict(min_word_freq=50):
             # remove <unk> for now, since we will set it as last index
             del word_freq['<unk>']
 
-        word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
+        word_freq = [x for x in list(word_freq.items()) if x[1] > min_word_freq]
 
         word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
         words, _ = list(zip(*word_freq_sorted))
-        word_idx = dict(zip(words, xrange(len(words))))
+        word_idx = dict(list(zip(words, list(range(len(words))))))
         word_idx['<unk>'] = len(words)
 
     return word_idx
diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py
index 6259cc35b4..ffa9008c80 100644
--- a/python/paddle/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
@@ -65,7 +65,7 @@ def reader_creator(image_filename, label_filename, buffer_size):
 
                 images = images / 255.0 * 2.0 - 1.0
 
-                for i in xrange(buffer_size):
+                for i in range(buffer_size):
                     yield images[i, :], int(labels[i])
         finally:
             try:
diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py
index ab11716202..056ec21786 100644
--- a/python/paddle/dataset/movielens.py
+++ b/python/paddle/dataset/movielens.py
@@ -16,7 +16,7 @@ Movielens 1-M dataset.
 
 Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
 movies, which was collected by GroupLens Research. This module will download
-Movielens 1-M dataset from 
+Movielens 1-M dataset from
 http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
 set and test set into paddle reader creators.
 
@@ -187,7 +187,7 @@ def max_movie_id():
     Get the maximum value of movie id.
     """
     __initialize_meta_info__()
-    return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
+    return reduce(__max_index_info__, list(MOVIE_INFO.values())).index
 
 
 def max_user_id():
@@ -195,7 +195,7 @@ def max_user_id():
     Get the maximum value of user id.
     """
     __initialize_meta_info__()
-    return reduce(__max_index_info__, USER_INFO.viewvalues()).index
+    return reduce(__max_index_info__, list(USER_INFO.values())).index
 
 
 def __max_job_id_impl__(a, b):
@@ -210,7 +210,7 @@ def max_job_id():
     Get the maximum value of job id.
     """
     __initialize_meta_info__()
-    return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
+    return reduce(__max_job_id_impl__, list(USER_INFO.values())).job_id
 
 
 def movie_categories():
@@ -243,7 +243,7 @@ def unittest():
     for test_count, _ in enumerate(test()()):
         pass
 
-    print train_count, test_count
+    print(train_count, test_count)
 
 
 def fetch():
diff --git a/python/paddle/dataset/mq2007.py b/python/paddle/dataset/mq2007.py
index d3b3dd524c..cc4d088316 100644
--- a/python/paddle/dataset/mq2007.py
+++ b/python/paddle/dataset/mq2007.py
@@ -26,7 +26,7 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20
 import os
 import functools
 import rarfile
-from common import download
+from .common import download
 import numpy as np
 
 # URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
@@ -53,7 +53,7 @@ class Query(object):
   ----------
   query_id : int
     query_id in dataset, mapping from query to relevance documents
-  relevance_score : int 
+  relevance_score : int
     relevance score of query and document pair
   feature_vector : array, dense feature
     feature in vector format
@@ -92,7 +92,7 @@ class Query(object):
             sys.stdout.write("expect 48 space split parts, get %d" %
                              (len(parts)))
             return None
-        # format : 0 qid:10 1:0.000272 2:0.000000 .... 
+        # format : 0 qid:10 1:0.000272 2:0.000000 ....
         self.relevance_score = int(parts[0])
         self.query_id = int(parts[1].split(':')[1])
         for p in parts[2:]:
@@ -295,7 +295,7 @@ def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1):
   --------
   filename : string
   fill_missing : fill the missing value. default in MQ2007 is -1
-  
+
   Returns
   ------
   yield
@@ -330,4 +330,4 @@ if __name__ == "__main__":
     mytest = functools.partial(
         __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
     for label, query in mytest():
-        print label, query
+        print(label, query)
diff --git a/python/paddle/dataset/sentiment.py b/python/paddle/dataset/sentiment.py
index f5461164fe..953ada057b 100644
--- a/python/paddle/dataset/sentiment.py
+++ b/python/paddle/dataset/sentiment.py
@@ -43,11 +43,11 @@ def download_data_if_not_yet():
             nltk.data.path.append(paddle.dataset.common.DATA_HOME)
         movie_reviews.categories()
     except LookupError:
-        print "Downloading movie_reviews data set, please wait....."
+        print("Downloading movie_reviews data set, please wait.....")
         nltk.download(
             'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
-        print "Download data set success....."
-        print "Path is " + nltk.data.find('corpora/movie_reviews').path
+        print("Download data set success.....")
+        print("Path is " + nltk.data.find('corpora/movie_reviews').path)
 
 
 def get_word_dict():
@@ -64,7 +64,7 @@ def get_word_dict():
         for field in movie_reviews.fileids(category):
             for words in movie_reviews.words(field):
                 word_freq_dict[words] += 1
-    words_sort_list = word_freq_dict.items()
+    words_sort_list = list(word_freq_dict.items())
     words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
     for index, word in enumerate(words_sort_list):
         words_freq_sorted.append((word[0], index))
@@ -80,7 +80,8 @@ def sort_files():
     files_list = list()
     neg_file_list = movie_reviews.fileids('neg')
     pos_file_list = movie_reviews.fileids('pos')
-    files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
+    files_list = list(
+        chain.from_iterable(list(zip(neg_file_list, pos_file_list))))
     return files_list
 
 
diff --git a/python/paddle/dataset/tests/common_test.py b/python/paddle/dataset/tests/common_test.py
index e7cc02aa83..777cd06a19 100644
--- a/python/paddle/dataset/tests/common_test.py
+++ b/python/paddle/dataset/tests/common_test.py
@@ -36,7 +36,7 @@ class TestCommon(unittest.TestCase):
     def test_split(self):
         def test_reader():
             def reader():
-                for x in xrange(10):
+                for x in range(10):
                     yield x
 
             return reader
@@ -49,7 +49,7 @@ class TestCommon(unittest.TestCase):
 
     def test_cluster_file_reader(self):
         _, temp_path = tempfile.mkstemp()
-        for x in xrange(5):
+        for x in range(5):
             with open(temp_path + '/%05d.test' % x) as f:
                 f.write('%d\n' % x)
         reader = paddle.dataset.common.cluster_files_reader(
@@ -63,7 +63,7 @@ class TestCommon(unittest.TestCase):
 
         def test_reader():
             def reader():
-                for x in xrange(record_num):
+                for x in range(record_num):
                     yield x
 
             return reader
diff --git a/python/paddle/dataset/tests/imikolov_test.py b/python/paddle/dataset/tests/imikolov_test.py
index 233fd9fc8c..50f50d947d 100644
--- a/python/paddle/dataset/tests/imikolov_test.py
+++ b/python/paddle/dataset/tests/imikolov_test.py
@@ -59,7 +59,7 @@ class TestMikolov(unittest.TestCase):
         self.assertEqual(first_line, read_line)
 
     def test_total(self):
-        _, idx = zip(*WORD_DICT.items())
+        _, idx = list(zip(*list(WORD_DICT.items())))
         self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
 
 
diff --git a/python/paddle/dataset/tests/test_sentiment.py b/python/paddle/dataset/tests/test_sentiment.py
index 543f4b7378..37326517f7 100644
--- a/python/paddle/dataset/tests/test_sentiment.py
+++ b/python/paddle/dataset/tests/test_sentiment.py
@@ -24,9 +24,8 @@ from nltk.corpus import movie_reviews
 class TestSentimentMethods(unittest.TestCase):
     def test_get_word_dict(self):
         word_dict = st.get_word_dict()[0:10]
-        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
-                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
-                          (u'is', 8), (u'in', 9)]
+        test_word_list = [(',', 0), ('the', 1), ('.', 2), ('a', 3), ('and', 4),
+                          ('of', 5), ('to', 6), ("'", 7), ('is', 8), ('in', 9)]
         for idx, each in enumerate(word_dict):
             self.assertEqual(each, test_word_list[idx])
         self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py
index fbfa477d05..410ca7af0d 100644
--- a/python/paddle/dataset/uci_housing.py
+++ b/python/paddle/dataset/uci_housing.py
@@ -49,9 +49,12 @@ def feature_range(maximums, minimums):
     import matplotlib.pyplot as plt
     fig, ax = plt.subplots()
     feature_num = len(maximums)
-    ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
+    ax.bar(list(range(feature_num)),
+           maximums - minimums,
+           color='r',
+           align='center')
     ax.set_title('feature scale')
-    plt.xticks(range(feature_num), feature_names)
+    plt.xticks(list(range(feature_num)), feature_names)
     plt.xlim([-1, feature_num])
     fig.set_figheight(6)
     fig.set_figwidth(10)
@@ -71,7 +74,7 @@ def load_data(filename, feature_num=14, ratio=0.8):
     maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
         axis=0) / data.shape[0]
     feature_range(maximums[:-1], minimums[:-1])
-    for i in xrange(feature_num - 1):
+    for i in range(feature_num - 1):
         data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
     offset = int(data.shape[0] * ratio)
     UCI_TRAIN_DATA = data[:offset]
diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py
index f0908c7378..c9a257ba3f 100644
--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -40,7 +40,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/'
              'wmt_shrinked_data/wmt14.tgz')
 MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
 # BLEU of this trained model is 26.92
-URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
+URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz'
 MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
 
 START = "<s>"
@@ -154,8 +154,8 @@ def get_dict(dict_size, reverse=True):
     tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
     src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
     if reverse:
-        src_dict = {v: k for k, v in src_dict.items()}
-        trg_dict = {v: k for k, v in trg_dict.items()}
+        src_dict = {v: k for k, v in list(src_dict.items())}
+        trg_dict = {v: k for k, v in list(trg_dict.items())}
     return src_dict, trg_dict
 
 
diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py
index 540d43b692..4e3c466c38 100644
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -70,7 +70,9 @@ def __build_dict(tar_file, dict_size, save_path, lang):
         fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
         for idx, word in enumerate(
                 sorted(
-                    word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
+                    iter(list(word_dict.items())),
+                    key=lambda x: x[1],
+                    reverse=True)):
             if idx + 3 == dict_size: break
             fout.write("%s\n" % (word[0]))
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 9903047f74..cccb4abe6c 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -14,81 +14,81 @@
 
 from __future__ import print_function
 # import all class inside framework into fluid module
-import framework
-from framework import *
+from . import framework
+from .framework import *
 # import all class inside executor into fluid module
-import executor
-from executor import *
-
-import trainer
-from trainer import Trainer
-from trainer import BeginEpochEvent
-from trainer import EndEpochEvent
-from trainer import BeginStepEvent
-from trainer import EndStepEvent
-from trainer import CheckpointConfig
-
-import inferencer
-from inferencer import Inferencer
-
-import io
-import evaluator
-import initializer
-import layers
-import contrib
-import nets
-import optimizer
-import backward
-import regularizer
-import average
-import metrics
-import transpiler
-from param_attr import ParamAttr, WeightNormParamAttr
-from data_feeder import DataFeeder
-from core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
-from transpiler import DistributeTranspiler, InferenceTranspiler, \
+from . import executor
+from .executor import *
+
+from . import trainer
+from .trainer import Trainer
+from .trainer import BeginEpochEvent
+from .trainer import EndEpochEvent
+from .trainer import BeginStepEvent
+from .trainer import EndStepEvent
+from .trainer import CheckpointConfig
+
+from . import inferencer
+from .inferencer import Inferencer
+
+from . import io
+from . import evaluator
+from . import initializer
+from . import layers
+from . import contrib
+from . import nets
+from . import optimizer
+from . import backward
+from . import regularizer
+from . import average
+from . import metrics
+from . import transpiler
+from .param_attr import ParamAttr, WeightNormParamAttr
+from .data_feeder import DataFeeder
+from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
+from .transpiler import DistributeTranspiler, InferenceTranspiler, \
     memory_optimize, release_memory, DistributeTranspilerConfig
-from concurrency import (Go, make_channel, channel_send, channel_recv,
-                         channel_close, Select)
-from lod_tensor import create_lod_tensor, create_random_int_lodtensor
-import clip
-import profiler
-import unique_name
-import recordio_writer
-import parallel_executor
-from parallel_executor import *
+from .concurrency import (Go, make_channel, channel_send, channel_recv,
+                          channel_close, Select)
+from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
+from . import clip
+from . import profiler
+from . import unique_name
+from . import recordio_writer
+from . import parallel_executor
+from .parallel_executor import *
 from paddle.fluid.layers.math_op_patch import monkey_patch_variable
 
 Tensor = LoDTensor
 
 __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
-          trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
-          parallel_executor.__all__ + lod_tensor.__all__ + [
-              'io',
-              'initializer',
-              'layers',
-              'contrib',
-              'transpiler',
-              'nets',
-              'optimizer',
-              'learning_rate_decay',
-              'backward',
-              'regularizer',
-              'LoDTensor',
-              'LoDTensorArray',
-              'CPUPlace',
-              'CUDAPlace',
-              'CUDAPinnedPlace',
-              'Tensor',
-              'ParamAttr',
-              'WeightNormParamAttr',
-              'DataFeeder',
-              'clip',
-              'profiler',
-              'unique_name',
-              'recordio_writer',
-              'Scope',
-          ]
+    trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
+    parallel_executor.__all__ + lod_tensor.__all__ + [
+        'io',
+        'initializer',
+        'layers',
+        'contrib',
+        'transpiler',
+        'nets',
+        'optimizer',
+        'learning_rate_decay',
+        'backward',
+        'regularizer',
+        'LoDTensor',
+        'LoDTensorArray',
+        'CPUPlace',
+        'CUDAPlace',
+        'CUDAPinnedPlace',
+        'Tensor',
+        'ParamAttr',
+        'WeightNormParamAttr',
+        'DataFeeder',
+        'clip',
+        'profiler',
+        'unique_name',
+        'recordio_writer',
+        'Scope',
+    ]
 
 
 def __bootstrap__():
@@ -99,8 +99,8 @@ def __bootstrap__():
         None
     """
     import sys
-    import core
     import os
+    from . import core
 
     in_test = 'unittest' in sys.modules
 
@@ -123,10 +123,13 @@ def __bootstrap__():
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
         'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
-        'init_allocated_mem'
+        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
+        'cpu_deterministic'
     ]
     if core.is_compiled_with_dist():
         read_env_flags.append('rpc_deadline')
+        read_env_flags.append('rpc_server_profile_period')
+        read_env_flags.append('rpc_server_profile_path')
 
     if core.is_compiled_with_cuda():
         read_env_flags += [
diff --git a/python/paddle/fluid/annotations.py b/python/paddle/fluid/annotations.py
index bb8756a466..15e7976354 100644
--- a/python/paddle/fluid/annotations.py
+++ b/python/paddle/fluid/annotations.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
 import functools
 import sys
 
@@ -28,7 +29,7 @@ def deprecated(since, instead, extra_message=""):
 
         @functools.wraps(func)
         def wrapper(*args, **kwargs):
-            print >> sys.stderr, err_msg
+            print(err_msg, file=sys.stderr)
             return func(*args, **kwargs)
 
         wrapper.__doc__ += "\n    "
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 812f68bdd8..6b73974511 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -16,7 +16,8 @@ from paddle.fluid import framework as framework
 from . import core
 import collections
 import copy
-import unique_name
+import six
+from . import unique_name
 
 __all__ = ['append_backward']
 
@@ -44,17 +45,25 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
     """
     op_desc = core.OpDesc()
     op_desc.set_type(op_type)
-    for para, args in inputs.iteritems():
-        op_desc.set_input(para, args)
-    for para, args in outputs.iteritems():
-        op_desc.set_output(para, args)
+    for para, args in list(inputs.items()):
+        op_desc.set_input(
+            para,
+            list(
+                map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
+                    args)))
+    for para, args in list(outputs.items()):
+        op_desc.set_output(
+            para,
+            list(
+                map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
+                    args)))
 
     op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
 
     if op_role_attr_name not in attrs:
         attrs[
             op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
-    for name, val in attrs.iteritems():
+    for name, val in list(attrs.items()):
         if isinstance(val, framework.Block):
             op_desc.set_block_attr(name, val.desc)
         else:
@@ -105,7 +114,9 @@ def _strip_grad_suffix_(name):
     e.g. x@GRAD ==> x
          y@GRAD@RENAME@1 ==> y
     """
-    pos = name.find(core.grad_var_suffix())
+    if isinstance(name, six.text_type):
+        name = name.encode()
+    pos = name.find(six.b(core.grad_var_suffix()))
     return name[:pos] if pos != -1 else name
 
 
@@ -114,7 +125,9 @@ def _append_grad_suffix_(name):
     Append grad suffix to the given variable name
     e.g. x ==> x@GRAD
     """
-    return name + core.grad_var_suffix()
+    if isinstance(name, six.text_type):
+        name = name.encode()
+    return name + six.b(core.grad_var_suffix())
 
 
 def _addup_repetitive_outputs_(op_descs):
@@ -174,7 +187,7 @@ def _addup_repetitive_outputs_(op_descs):
                     op_desc.set_output(param_name, arg_names)
                     renamed_vars[var_name].append(new_name)
 
-    for var_name, inputs in renamed_vars.iteritems():
+    for var_name, inputs in list(renamed_vars.items()):
         if len(inputs) > 1:
             pending_sum_ops.append(
                 (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
@@ -198,16 +211,19 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
         out_arg_names = op_desc.output_arg_names()
         if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set):
             return True
-        if _all_in_set_(
-                filter(lambda name: name.find(core.grad_var_suffix()) != -1,
-                       op_desc.input_arg_names()), no_grad_set):
+        if _all_in_set_([
+                name for name in op_desc.input_arg_names()
+                if name.find(core.grad_var_suffix()) != -1
+        ], no_grad_set):
             no_grad_set.update(out_arg_names)
             return True
         return False
 
     # Remove ops whose outputs are all in no_grad_dict
-    op_descs = filter(
-        lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs)
+    op_descs = [
+        op_desc for op_desc in op_descs
+        if not _op_can_be_removed_(op_desc, no_grad_set)
+    ]
     # Insert fill_zeros_like_op
     to_insert = []
     for idx, op_desc in enumerate(op_descs):
@@ -217,12 +233,12 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
                     "X": [_strip_grad_suffix_(arg)]
                 }, {"Out": [arg]}, {}), idx))
 
-    map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
+    list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)])
 
     return op_descs
 
 
-import proto.framework_pb2 as framework_pb2
+from .proto import framework_pb2
 
 
 def serialize_op_decs(op_desc):
@@ -244,8 +260,10 @@ def _callback_lookup_(op):
     if op.type == 'parallel_do' and op.attr('use_nccl'):
         all_vars = op.block.vars
         param_names = set(op.input('parameters'))
-        param_names = filter(lambda name: all_vars[name].stop_gradient is False,
-                             param_names)
+        param_names = [
+            name for name in param_names
+            if all_vars[name].stop_gradient is False
+        ]
         param_grad_names = [n + "@GRAD" for n in param_names]
 
         class ParallelDoCallBack(object):
@@ -399,7 +417,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
                 continue
             block.desc.var(grad_var_name)
             new_vars.add(grad_var_name)
-            if not grad_to_var.has_key(grad_var_name):
+            if grad_var_name not in grad_to_var:
                 continue
             grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
         # infer_shape and infer_type
@@ -427,7 +445,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
                 op_desc.rename_output(name, new_name)
                 var_map[name] = new_name
 
-    for g, ng in var_map.iteritems():
+    for g, ng in list(var_map.items()):
         if g in grad_to_var:
             grad_to_var[ng] = grad_to_var[g]
             grad_to_var.pop(g)
@@ -439,7 +457,7 @@ def _get_stop_gradients_(program):
     for block in program.blocks:
         assert isinstance(block, framework.Block)
         block_no_grad_set = set()
-        for var in block.vars.itervalues():
+        for var in list(block.vars.values()):
             assert isinstance(var, framework.Variable)
             if var.stop_gradient:
                 block_no_grad_set.add(_append_grad_suffix_(var.name))
@@ -452,51 +470,51 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
     """
     Append backward part to main_program.
 
-    A complete neural network training is made up of forward and backward 
-    propagation. However, when we configure a network, we only need to 
-    specify its forwrd part. The backward part is generated automatically 
+    A complete neural network training is made up of forward and backward
+    propagation. However, when we configure a network, we only need to
+    specify its forwrd part. The backward part is generated automatically
     according to the forward part by this function.
 
-    In most cases, users do not need to invoke this function manually. It 
+    In most cases, users do not need to invoke this function manually. It
     will be automatically invoked by the optimizer's `minimize` function.
 
     Args:
         loss(Variable): The loss variable of the network.
-        parameter_list(list[string]|None): Names of parameters that need 
-                                           to be updated by optimizers. 
-                                           If it is None, all parameters 
+        parameter_list(list[string]|None): Names of parameters that need
+                                           to be updated by optimizers.
+                                           If it is None, all parameters
                                            will be updated.
                                            Default: None
-        no_grad_set(set|None): Variables in the Block 0 whose gradients 
-                               should be ignored. All variables with 
-                               `step_gradient=True` from all blocks will 
+        no_grad_set(set|None): Variables in the Block 0 whose gradients
+                               should be ignored. All variables with
+                               `step_gradient=True` from all blocks will
                                be automatically added into this set.
                                Default: None
-        callbacks(list[callable object]|None): The callbacks are used for 
-                                               doing some custom jobs during 
-                                               backward part building. All 
-                                               callable objects in it will 
-                                               be invoked once each time a 
-                                               new gradient operator is added 
-                                               into the program. The callable 
-                                               object must has two input 
-                                               parameters: 'block' and 'context'. 
-                                               The 'block' is the block which 
-                                               the new gradient operator will 
-                                               be added to. The 'context' is a 
-                                               map, whose keys are gradient 
-                                               variable names and values are 
+        callbacks(list[callable object]|None): The callbacks are used for
+                                               doing some custom jobs during
+                                               backward part building. All
+                                               callable objects in it will
+                                               be invoked once each time a
+                                               new gradient operator is added
+                                               into the program. The callable
+                                               object must has two input
+                                               parameters: 'block' and 'context'.
+                                               The 'block' is the block which
+                                               the new gradient operator will
+                                               be added to. The 'context' is a
+                                               map, whose keys are gradient
+                                               variable names and values are
                                                corresponding original variables.
-                                               In addition to this, the 'context' 
-                                               has another special key-value pair: 
-                                               the key is string '__current_op_desc__' 
-                                               and the value is the op_desc of the 
-                                               gradient operator who has just 
-                                               triggered the callable object. 
+                                               In addition to this, the 'context'
+                                               has another special key-value pair:
+                                               the key is string '__current_op_desc__'
+                                               and the value is the op_desc of the
+                                               gradient operator who has just
+                                               triggered the callable object.
 
     Returns:
-        list[(Variable,Variable)]: Pairs of parameter and its 
-        corresponding gradients. The key is the parameter and the 
+        list[(Variable,Variable)]: Pairs of parameter and its
+        corresponding gradients. The key is the parameter and the
         value is gradient variable.
 
     Raises:
@@ -535,7 +553,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
         no_grad_set = set()
     no_grad_set = copy.copy(no_grad_set)
     no_grad_dict = _get_stop_gradients_(program)
-    no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
+    no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set)))
 
     grad_info_map = dict()
     root_block = program.block(0)
@@ -558,7 +576,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
 
     block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
     op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set)
-    no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
+    no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
 
     _append_backward_ops_(root_block, op_path, root_block, no_grad_dict,
                           grad_to_var, callbacks)
@@ -572,8 +590,6 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
 
     program.current_block_idx = current_block_idx
     program._sync_with_cpp()
-    # FIXME(zcd): prevent loss.grad optimized by mem_opt.
-    loss.block.var(_append_grad_suffix_(loss.name)).persistable = True
 
     if parameter_list is not None:
         parameters = parameter_list
@@ -699,7 +715,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
         no_grad_set = set()
     no_grad_set = copy.copy(no_grad_set)
     no_grad_dict = _get_stop_gradients_(prog)
-    no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
+    no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set)))
 
     fwd_op_num = block.desc.op_size()
 
@@ -733,7 +749,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
 
     block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
     op_path = _find_op_path_(block, targets, inputs, block_no_grad_set)
-    no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
+    no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
     grad_to_var = dict()
     grad_info_map = dict()
     _append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var)
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index c029662ebc..4b0a792f78 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -13,10 +13,11 @@
 # limitations under the License.
 
 import copy
+import six
 
 import functools
-import layers
-import framework
+from . import layers
+from . import framework
 from . import core
 
 __all__ = [
@@ -80,8 +81,7 @@ def error_clip_callback(block, context):
     # the context is a grad_to_var map
     grad_to_var = context
     op_desc = block.desc.op(block.desc.op_size() - 1)
-    for grad_n in filter(lambda n: grad_to_var.has_key(n),
-                         op_desc.output_arg_names()):
+    for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
         fwd_var = block._var_recursive(grad_to_var[grad_n])
         error_clip = getattr(fwd_var, "error_clip", None)
         if not (error_clip is None or isinstance(error_clip,
@@ -247,8 +247,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
     """
 
     def __init__(self, clip_norm, group_name="default_group"):
-        if not isinstance(group_name, basestring):
-            raise TypeError("'group_name' must be a basestring.")
+        if not isinstance(group_name, six.string_types):
+            raise TypeError("'group_name' must be a %s." % (six.string_types))
 
         self.clip_norm = clip_norm
         self.group_name = group_name
@@ -284,7 +284,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
                 x=clip_var,
                 y=layers.elementwise_max(
                     x=clip_var, y=group_norm_var))
-            assert group_scale_var.shape == (1L, )
+            assert group_scale_var.shape == (1, )
             self.context[group_scale_name] = group_scale_var
 
         new_grad = layers.elementwise_mul(
@@ -313,7 +313,7 @@ def set_gradient_clip(clip, param_list=None, program=None):
         program = framework.default_main_program()
     if param_list is None:
         param_list = program.block(0).all_parameters()
-    if all(isinstance(elem, basestring) for elem in param_list):
+    if all(isinstance(elem, six.string_types) for elem in param_list):
         param_list = [program.block(0).var(elem) for elem in param_list]
     if not all(isinstance(elem, framework.Parameter) for elem in param_list):
         raise TypeError(
diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py
index b8fe9bd4c1..a8c4d66720 100644
--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from layers.control_flow import BlockGuard, equal
+from .layers.control_flow import BlockGuard, equal
 from .framework import Operator
-from layer_helper import LayerHelper, unique_name
-from layers import fill_constant
-import core
+from .layer_helper import LayerHelper, unique_name
+from .layers import fill_constant
+from . import core
 
 __all__ = [
     'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close',
diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
index 12cd5d918e..58f2da1c3b 100644
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import decoder
-from decoder import *
+from . import decoder
+from .decoder import *
+from . import memory_usage_calc
+from .memory_usage_calc import *
 
-__all__ = decoder.__all__
+__all__ = decoder.__all__ + memory_usage_calc.__all__
diff --git a/python/paddle/fluid/contrib/decoder/__init__.py b/python/paddle/fluid/contrib/decoder/__init__.py
index 22cfe69269..6343c1543d 100644
--- a/python/paddle/fluid/contrib/decoder/__init__.py
+++ b/python/paddle/fluid/contrib/decoder/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import beam_search_decoder
-from beam_search_decoder import *
+from . import beam_search_decoder
+from .beam_search_decoder import *
 
 __all__ = beam_search_decoder.__all__
diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
index ba6e138782..d268a948f7 100644
--- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
+++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
@@ -22,6 +22,7 @@ This API is still under active development and may change drastically.
 
 import contextlib
 import numpy as np
+import six
 
 from ... import layers
 from ...framework import Variable
@@ -191,7 +192,7 @@ class StateCell(object):
         self._helper = LayerHelper('state_cell', name=name)
         self._cur_states = {}
         self._state_names = []
-        for state_name, state in states.items():
+        for state_name, state in six.iteritems(states):
             if not isinstance(state, InitState):
                 raise ValueError('state must be an InitState object.')
             self._cur_states[state_name] = state
@@ -346,7 +347,7 @@ class StateCell(object):
         if self._in_decoder and not self._switched_decoder:
             self._switch_decoder()
 
-        for input_name, input_value in inputs.items():
+        for input_name, input_value in six.iteritems(inputs):
             if input_name not in self._inputs:
                 raise ValueError('Unknown input %s. '
                                  'Please make sure %s in input '
@@ -361,7 +362,7 @@ class StateCell(object):
         if self._in_decoder and not self._switched_decoder:
             self._switched_decoder()
 
-        for state_name, decoder_state in self._states_holder.items():
+        for state_name, decoder_state in six.iteritems(self._states_holder):
             if id(self._cur_decoder_obj) not in decoder_state:
                 raise ValueError('Unknown decoder object, please make sure '
                                  'switch_decoder been invoked.')
@@ -671,7 +672,7 @@ class BeamSearchDecoder(object):
             feed_dict = {}
             update_dict = {}
 
-            for init_var_name, init_var in self._input_var_dict.items():
+            for init_var_name, init_var in six.iteritems(self._input_var_dict):
                 if init_var_name not in self.state_cell._inputs:
                     raise ValueError('Variable ' + init_var_name +
                                      ' not found in StateCell!\n')
@@ -721,7 +722,8 @@ class BeamSearchDecoder(object):
                     self.state_cell.update_states()
                     self.update_array(prev_ids, selected_ids)
                     self.update_array(prev_scores, selected_scores)
-                    for update_name, var_to_update in update_dict.items():
+                    for update_name, var_to_update in six.iteritems(
+                            update_dict):
                         self.update_array(var_to_update, feed_dict[update_name])
 
     def read_array(self, init, is_ids=False, is_scores=False):
diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py
new file mode 100644
index 0000000000..5da846edb6
--- /dev/null
+++ b/python/paddle/fluid/contrib/memory_usage_calc.py
@@ -0,0 +1,102 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module privides a memory usage calculate function for user.
+The purpose of this API is to allow users to estimate memory usage of
+a program under a special batch size, then user can set appropriate 
+batch size to fully utilize a GPU. 
+
+This API is still under active development and may change drastically.
+"""
+
+from .. import core
+from ..framework import Program, Variable
+
+__all__ = ['memory_usage']
+
+dtype_to_size = {
+    core.VarDesc.VarType.FP16: 2,
+    core.VarDesc.VarType.FP32: 4,
+    core.VarDesc.VarType.FP64: 8,
+    core.VarDesc.VarType.INT16: 2,
+    core.VarDesc.VarType.INT32: 4,
+    core.VarDesc.VarType.INT64: 8,
+    core.VarDesc.VarType.BOOL: 1,
+    core.VarDesc.VarType.UINT8: 1,
+}
+
+DEBUG = False
+
+
+def memory_usage(program, batch_size):
+    """
+    Get the estimate memory usage of program with input batch size.
+
+    Args:
+        program(Program): The current Program.
+        batch_size(int): The current input data batch_size.  
+    
+    Returns:
+        min_total_memory(float): the estimate memory usage lower bound.
+        max_total_memory(float): the estimate memory usage upper bound.
+        unit_str(string): the unit of estimate usage result.
+    
+    Examples:
+        
+        >>> import paddle.fluid as fluid
+        >>> lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
+                fluid.default_main_program(), batch_size=10)
+        >>> print "memory usage is about %.3f - %.3f %s" % \
+                (lower_usage, upper_usage, unit)
+
+    """
+
+    # Parameters check
+    if not isinstance(program, Program):
+        raise TypeError(
+            "Calculating Memory Usage requires Program as its Parameter."
+            "But you passed in %s" % (type(prgram)))
+    if batch_size <= 0:
+        raise ValueError("The batch size need to be positive.")
+
+    # Get the var_name list of first block and calculate
+    total_memory = 0.0
+    for var in program.global_block().vars.itervalues():
+        data_count = 1
+        for x in var.shape:
+            if x == -1:
+                data_count *= batch_size
+            else:
+                data_count *= x
+        var_memory = data_count * dtype_to_size[var.dtype]
+        if DEBUG:
+            print "%s memory usage: %d" % (var.name, var_memory)
+        total_memory += var_memory
+    if DEBUG:
+        print "total memory usage: %.2f" % (total_memory)
+
+    # Convert appropriate unit
+    unit_str = "B"
+    if total_memory > 1024:
+        total_memory /= 1024
+        unit_str = "KB"
+        if total_memory > 1024:
+            total_memory /= 1024
+            unit_str = "MB"
+
+    # Append extra memory consumption (5% - 10%)
+    min_total_memory = total_memory * 1.05
+    max_total_memory = total_memory * 1.1
+
+    return min_total_memory, max_total_memory, unit_str
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index c859778b37..9452cf0e2a 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -12,14 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
-import core
+from . import core
 import numpy
 import os
-import six.moves as six
+import six
+from six.moves import zip, range, xrange
 import multiprocessing
 
-from framework import Variable, default_main_program
+from .framework import Variable, default_main_program
 
 __all__ = ['DataFeeder']
 
@@ -53,7 +53,7 @@ class DataToLoDTensorConverter(object):
         self.data = []
         self.lod = []
 
-        for i in six.range(lod_level):
+        for i in six.moves.range(lod_level):
             self.lod.append([])
 
     def feed(self, data):
@@ -142,7 +142,7 @@ class DataFeeder(object):
         if program is None:
             program = default_main_program()
         for each_var in feed_list:
-            if isinstance(each_var, basestring):
+            if isinstance(each_var, six.string_types):
                 each_var = program.block(0).var(each_var)
             if not isinstance(each_var, Variable):
                 raise TypeError("Feed list should contain a list of variable")
@@ -174,7 +174,7 @@ class DataFeeder(object):
             dict: the result of conversion.
         """
         converter = []
-        for lod_level, shape, dtype in six.zip(
+        for lod_level, shape, dtype in six.moves.zip(
                 self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
             converter.append(
                 DataToLoDTensorConverter(
@@ -187,10 +187,12 @@ class DataFeeder(object):
             assert len(each_sample) == len(converter), (
                 "The number of fields in data (%s) does not match " +
                 "len(feed_list) (%s)") % (len(each_sample), len(converter))
-            for each_converter, each_slot in six.zip(converter, each_sample):
+            for each_converter, each_slot in six.moves.zip(converter,
+                                                           each_sample):
                 each_converter.feed(each_slot)
         ret_dict = {}
-        for each_name, each_converter in six.zip(self.feed_names, converter):
+        for each_name, each_converter in six.moves.zip(self.feed_names,
+                                                       converter):
             ret_dict[each_name] = each_converter.done()
         return ret_dict
 
@@ -212,12 +214,14 @@ class DataFeeder(object):
         if isinstance(self.place, core.CUDAPlace):
             places = [
                 core.CUDAPlace(i)
-                for i in six.xrange(self._get_number_of_places_(num_places))
+                for i in six.moves.xrange(
+                    self._get_number_of_places_(num_places))
             ]
         else:
             places = [
                 core.CPUPlace()
-                for _ in six.xrange(self._get_number_of_places_(num_places))
+                for _ in six.moves.xrange(
+                    self._get_number_of_places_(num_places))
             ]
 
         if len(iterable) != len(places):
@@ -227,7 +231,7 @@ class DataFeeder(object):
                              "must be same.")
 
         place = self.place
-        for p, batch in six.zip(places, iterable):
+        for p, batch in six.moves.zip(places, iterable):
             self.place = p
             yield self.feed(batch)
         self.place = place
diff --git a/python/paddle/fluid/debugger.py b/python/paddle/fluid/debugger.py
index 1c56064a1e..b7a92cf044 100644
--- a/python/paddle/fluid/debugger.py
+++ b/python/paddle/fluid/debugger.py
@@ -14,8 +14,8 @@
 
 import sys
 import re
-from graphviz import GraphPreviewGenerator
-import proto.framework_pb2 as framework_pb2
+from .graphviz import GraphPreviewGenerator
+from .proto import framework_pb2
 from google.protobuf import text_format
 
 _vartype2str_ = [
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
index 00ba1a0457..c0671cce9a 100644
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -15,11 +15,11 @@
 import warnings
 import numpy as np
 
-import layers
-from framework import Program, Variable, program_guard
-import unique_name
-from layer_helper import LayerHelper
-from initializer import Constant
+from . import layers
+from .framework import Program, Variable, program_guard
+from . import unique_name
+from .layer_helper import LayerHelper
+from .initializer import Constant
 
 __all__ = [
     'ChunkEvaluator',
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 4178971398..35da1d06a2 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -14,7 +14,8 @@
 
 import numpy as np
 import contextlib
-from framework import Program, default_main_program, Variable
+import six
+from .framework import Program, default_main_program, Variable
 from . import core
 
 __all__ = [
@@ -204,19 +205,19 @@ def fetch_var(name, scope=None, return_numpy=True):
 
 
 def _get_program_cache_key(feed, fetch_list):
-    feed_var_names = feed.keys()
+    feed_var_names = list(feed.keys())
 
     def to_name_str(var):
         if isinstance(var, Variable):
             return var.desc.name()
         elif isinstance(var, str):
             return var
-        elif isinstance(var, basestring):
+        elif isinstance(var, six.string_types):
             return str(var)
         else:
             raise TypeError(str(var) + " should be Variable or str")
 
-    fetch_var_names = map(to_name_str, fetch_list)
+    fetch_var_names = list(map(to_name_str, fetch_list))
 
     return str(feed_var_names + fetch_var_names)
 
@@ -229,8 +230,8 @@ class Executor(object):
     to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
     the variables(or names) that user want to get after program run. Note: the executor will run all
     operators in the program but not only the operators dependent by the fetch_list.
-    It store the global variables into the global scope, and create a local scope for the temporary 
-    variables. The local scope contents will be discarded after every minibatch forward/backward finished. 
+    It store the global variables into the global scope, and create a local scope for the temporary
+    variables. The local scope contents will be discarded after every minibatch forward/backward finished.
     But the global scope variables will be persistent through different runs.
     All of ops in program will be running in sequence.
 
@@ -345,7 +346,7 @@ class Executor(object):
     def _fetch_data(self, fetch_list, fetch_var_name, scope):
         outs = [
             core.get_fetch_variable(scope, fetch_var_name, i)
-            for i in xrange(len(fetch_list))
+            for i in range(len(fetch_list))
         ]
         return outs
 
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 9fc3849ee0..3111674abb 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -15,21 +15,22 @@
 import collections
 import contextlib
 import re
+import six
 
 import numpy as np
 
-import proto.framework_pb2 as framework_pb2
+from .proto import framework_pb2
 try:
     from . import core
-except ImportError, e:
+except ImportError as e:
     raise ImportError(
         """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
     if you encounters \"libmkldnn.so not found\" errors. If you have python
     installed in other directory, replace \"/usr/local/lib\" with your own
     directory. The original error is: \n""" + e.message)
-except Exception, e:
+except Exception as e:
     raise e
-import unique_name
+from . import unique_name
 
 __all__ = [
     'Program',
@@ -86,7 +87,7 @@ def convert_np_dtype_to_dtype_(np_dtype):
     elif dtype == np.uint8:
         return core.VarDesc.VarType.UINT8
     else:
-        raise ValueError("Not supported numpy dtype " + str(dtype))
+        raise ValueError("Not supported numpy dtype " + six.binary_type(dtype))
 
 
 def dtype_is_floating(dtype):
@@ -129,15 +130,15 @@ def _debug_string_(proto, throw_on_error=True):
 
 class Variable(object):
     """
-    In Fluid, every input and output of an operator is a variable. In most 
-    cases, variables are used for holding different kinds of data or training 
-    labels. A variable belongs to a block. All variable has its own name and 
+    In Fluid, every input and output of an operator is a variable. In most
+    cases, variables are used for holding different kinds of data or training
+    labels. A variable belongs to a block. All variable has its own name and
     two variables in different blocks could have the same name.
 
-    There are many kinds of variables. Each kind of them has its own attributes 
-    and usages. Please reference the framework.proto for details. 
+    There are many kinds of variables. Each kind of them has its own attributes
+    and usages. Please reference the framework.proto for details.
 
-    Most of a Variable's member variables can be setted to be None. It mean 
+    Most of a Variable's member variables can be setted to be None. It mean
     it is not available or will be specified later.
 
     Args:
@@ -197,6 +198,7 @@ class Variable(object):
         if name is None:
             name = unique_name.generate('_generated_var')
         is_new_var = False
+        name = name if isinstance(name, six.binary_type) else name.encode()
         self.desc = self.block.desc.find_var(name)
 
         if self.desc is None:
@@ -290,13 +292,13 @@ class Variable(object):
         assert isinstance(throw_on_error, bool) and isinstance(with_details,
                                                                bool)
         protostr = self.desc.serialize_to_string()
-        proto = framework_pb2.VarDesc.FromString(str(protostr))
+        proto = framework_pb2.VarDesc.FromString(six.binary_type(protostr))
         res_str = _debug_string_(proto, throw_on_error)
         if with_details:
             additional_attr = ("error_clip", "stop_gradient")
             for attr_name in additional_attr:
-                res_str += "%s: %s\n" % (attr_name,
-                                         str(getattr(self, attr_name)))
+                res_str += "%s: %s\n" % (
+                    attr_name, six.binary_type(getattr(self, attr_name)))
         return res_str
 
     __repr__ = __str__
@@ -369,7 +371,7 @@ def get_all_op_protos():
     protostrs = core.get_all_op_protos()
     ret_values = []
     for pbstr in protostrs:
-        op_proto = framework_pb2.OpProto.FromString(str(pbstr))
+        op_proto = framework_pb2.OpProto.FromString(six.binary_type(pbstr))
         ret_values.append(op_proto)
     return ret_values
 
@@ -472,7 +474,6 @@ class Operator(object):
                  inputs=None,
                  outputs=None,
                  attrs=None):
-
         self.block = block
         self.desc = desc
         self.attrs = attrs
@@ -523,10 +524,19 @@ class Operator(object):
                             % (in_proto.name, len(in_args)))
                     in_arg_names = []
                     for arg in in_args:
-                        if isinstance(arg, basestring):
+                        if isinstance(arg, six.string_types):
                             in_arg_names.append(arg)
+                        elif isinstance(arg, six.binary_type):
+                            in_arg_names.append(arg.decode())
                         else:
-                            in_arg_names.append(arg.name)
+                            if isinstance(arg.name, six.string_types):
+                                in_arg_names.append(arg.name)
+                            elif isinstance(arg.name, six.binary_type):
+                                in_arg_names.append(arg.name.decode())
+                            else:
+                                raise TypeError(
+                                    "arguments require unicode, str or bytes, but get %s instead."
+                                    % (type(arg.name)))
                     self.desc.set_input(in_proto.name, in_arg_names)
                 else:
                     self.desc.set_input(in_proto.name, [])
@@ -541,8 +551,9 @@ class Operator(object):
             if not given == need:
                 raise ValueError(("Incorrect setting for output(s) of "
                                   "operator \"%s\". Need: [%s] Given: [%s]") %
-                                 (type, ", ".join(str(e) for e in need),
-                                  ", ".join(str(e) for e in given)))
+                                 (type,
+                                  ", ".join(six.binary_type(e) for e in need),
+                                  ", ".join(six.binary_type(e) for e in given)))
 
             for out_proto in proto.outputs:
                 out_args = outputs[out_proto.name]
@@ -554,7 +565,14 @@ class Operator(object):
                         (out_proto.name, len(out_args)))
                 out_arg_names = []
                 for arg in out_args:
-                    out_arg_names.append(arg.name)
+                    if isinstance(arg.name, six.string_types):
+                        out_arg_names.append(arg.name)
+                    elif isinstance(arg.name, six.binary_type):
+                        out_arg_names.append(arg.name.decode())
+                    else:
+                        raise TypeError(
+                            "arguments require unicode, str or bytes, but get %s instead."
+                            % (type(arg.name)))
                     arg.op = self
                 self.desc.set_output(out_proto.name, out_arg_names)
 
@@ -590,7 +608,7 @@ class Operator(object):
 
         """
         protostr = self.desc.serialize_to_string()
-        proto = framework_pb2.OpDesc.FromString(str(protostr))
+        proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr))
         return _debug_string_(proto, throw_on_error)
 
     def __str__(self):
@@ -845,7 +863,7 @@ class Block(object):
             re_add_indent = re.compile(r"\n(.)")
             res_str = "blocks {\n  idx: %d\n  parent_idx: %d" % (
                 self.idx, self.parent_idx)
-            for var in self.vars.itervalues():
+            for var in list(self.vars.values()):
                 res_str += "\n  vars {\n    %s  }" % re_add_indent.sub(
                     r"\n    \1", var.to_string(throw_on_error, with_details))
             for op in self.ops:
@@ -854,7 +872,8 @@ class Block(object):
             res_str += "\n}"
         else:
             protostr = self.desc.serialize_to_string()
-            proto = framework_pb2.BlockDesc.FromString(str(protostr))
+            proto = framework_pb2.BlockDesc.FromString(
+                six.binary_type(protostr))
             res_str = _debug_string_(proto, throw_on_error)
         return res_str
 
@@ -898,10 +917,11 @@ class Block(object):
         Returns:
             Variable: the Variable with the giving name.
         """
-        if not isinstance(name, basestring):
-            raise TypeError(
-                "var require string as parameter, but get %s instead." %
-                (type(name)))
+        if not isinstance(name, six.string_types):
+            if not isinstance(name, six.binary_type):
+                raise TypeError(
+                    "var require string as parameter, but get %s instead." %
+                    (type(name)))
         v = self.vars.get(name, None)
         if v is None:
             raise ValueError("var %s not in this block" % name)
@@ -949,10 +969,10 @@ class Block(object):
         raise ValueError("Var {0} is not found recursively".format(name))
 
     def all_parameters(self):
-        return list(self._iter_parameters())
+        return list(self.iter_parameters())
 
-    def _iter_parameters(self):
-        return (item[1] for item in self.vars.iteritems()
+    def iter_parameters(self):
+        return (item[1] for item in list(self.vars.items())
                 if isinstance(item[1], Parameter))
 
     def create_var(self, *args, **kwargs):
@@ -1038,7 +1058,26 @@ class Block(object):
         global_block = self.program.global_block()
         param = Parameter(global_block, *args, **kwargs)
         if 'initializer' in kwargs:
-            kwargs['initializer'](param, self)
+
+            def _is_inited_by(block, var):
+                init_ops = []
+                for op in block.ops:
+                    if var.name in op.output_arg_names:
+                        init_ops.append(op)
+                return init_ops
+
+            initializer = kwargs['initializer']
+            init_ops = _is_inited_by(global_block, param)
+            init_ops_len = len(init_ops)
+            if init_ops_len > 1:
+                raise RuntimeError("param " + param.name +
+                                   " is inited by multiple init ops " + str(
+                                       init_ops))
+            elif init_ops_len == 1:
+                #TODO already inited, do nothing, should log a warning
+                pass
+            else:
+                initializer(param, self)
         return param
 
     def append_op(self, *args, **kwargs):
@@ -1113,7 +1152,7 @@ class Block(object):
                 self.create_var(name=var.name(), desc=var, type=var.type())
 
         # sync variables removed from c++ end
-        for var in self.vars.keys():
+        for var in list(self.vars.keys()):
             if not self.desc.find_var(var):
                 self.vars.pop(var)
 
@@ -1185,7 +1224,7 @@ class Block(object):
         if not isinstance(other, Block):
             raise TypeError(
                 "_copy_param_info_from should be invoked with Block")
-        for p in other._iter_parameters():
+        for p in other.iter_parameters():
             assert isinstance(p, Parameter)
             v = self.vars.get(p.name, None)
             if v is None:
@@ -1385,7 +1424,8 @@ class Program(object):
                 res_str += block.to_string(throw_on_error, with_details)
         else:
             protostr = self.desc.serialize_to_string()
-            proto = framework_pb2.ProgramDesc.FromString(str(protostr))
+            proto = framework_pb2.ProgramDesc.FromString(
+                six.binary_type(protostr))
             res_str = _debug_string_(proto, throw_on_error)
         return res_str
 
@@ -1483,7 +1523,7 @@ class Program(object):
         else:
             p = Program()
             p.desc = core.ProgramDesc(self.desc)
-            p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
+            p.blocks = [Block(p, i) for i in range(self.desc.num_blocks())]
             p._sync_with_cpp()
 
         p._copy_param_info_from(self)
@@ -1535,13 +1575,18 @@ class Program(object):
             targets_idx.append([t.block.idx, t.idx])
         res = Program()
         res.desc = core.prune(self.desc, targets_idx)
-        res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
+        res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())]
         res._sync_with_cpp()
         return res
 
     def inference_optimize(self):
         """
-        This method will create a new program and change the :code:`is_test`
+        This method will create a new program and do following adjustments on it:
+        1. Remove all reader variables and their creator ops if exist.
+
+        2. Remove the :code:`read_op` if exists.
+
+        3. change the :code:`is_test`
         attribute of operators to :code:`True`. All the :code:`Parameter`
         information will be lost.
 
@@ -1555,13 +1600,29 @@ class Program(object):
         # core.inference_optimize being fixed.
         res = Program()
         res.desc = core.ProgramDesc(self.desc)
-        for i in xrange(res.desc.num_blocks()):
+
+        # remove all readers and the read_op if exist
+        read_op_idx = 0
+        root_block = res.desc.block(0)
+        while True:
+            if read_op_idx >= root_block.op_size() or root_block.op(
+                    read_op_idx).type() == 'read':
+                break
+            read_op_idx += 1
+        if read_op_idx < root_block.op_size():
+            root_block._remove_op(0, read_op_idx + 1)
+        for var in root_block.all_vars():
+            if var.type() == core.VarDesc.VarType.READER:
+                root_block._remove_var(var.name())
+
+        # change all `is_test` attributes to True
+        for i in range(res.desc.num_blocks()):
             block = res.desc.block(i)
-            for j in xrange(block.op_size()):
+            for j in range(block.op_size()):
                 op = block.op(j)
                 if op.has_attr('is_test'):
                     op.set_attr('is_test', True)
-        res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
+        res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())]
         res._sync_with_cpp()
         return res
 
@@ -1574,14 +1635,14 @@ class Program(object):
         and deserialization.
 
         Args:
-            binary_str(str): The binary prootbuf string.
+            binary_str_type(str): The binary prootbuf string.
 
         Returns:
             Program: A deserialized program desc.
         """
         p = Program()
         p.desc = core.ProgramDesc(binary_str)
-        p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())]
+        p.blocks = [Block(p, i) for i in range(p.desc.num_blocks())]
         p._sync_with_cpp()
         return p
 
@@ -1609,7 +1670,7 @@ class Program(object):
         self._seed = seed
 
     def __repr__(self):
-        return str(self)
+        return self.__str__()
 
     def global_block(self):
         """
@@ -1720,7 +1781,7 @@ class Program(object):
         if len(self.blocks) != len(other.blocks):
             raise ValueError("_copy_param_info_from should be invoked with two "
                              "program, with represent the same topology")
-        for var in other.global_block().vars.itervalues():
+        for var in list(other.global_block().vars.values()):
             if var.is_data:
                 self.global_block().var(var.name).is_data = True
 
@@ -1732,15 +1793,15 @@ class Program(object):
             iterable: The generator will yield every variable in this program.
         """
         for each_block in self.blocks:
-            for each_var in each_block.vars.itervalues():
+            for each_var in list(each_block.vars.values()):
                 yield each_var
 
 
 class Parameter(Variable):
     """
-    Parameter is derived from Variable. A parameter is a persistable 
+    Parameter is derived from Variable. A parameter is a persistable
     Variable, and will be updated by optimizers after each iteration.
-    The training of a neural network is essentially the updating of 
+    The training of a neural network is essentially the updating of
     its parameters.
 
     Relative to a general Variable, a Parameter has several its own
@@ -1806,8 +1867,8 @@ class Parameter(Variable):
             additional_attr = ("trainable", "optimize_attr", "regularizer",
                                "gradient_clip_attr", "do_model_average")
             for attr_name in additional_attr:
-                res_str += "%s: %s\n" % (attr_name,
-                                         str(getattr(self, attr_name)))
+                res_str += "%s: %s\n" % (
+                    attr_name, six.binary_type(getattr(self, attr_name)))
         else:
             res_str = Variable.to_string(self, throw_on_error, False)
         return res_str
diff --git a/python/paddle/fluid/graphviz.py b/python/paddle/fluid/graphviz.py
index 125b4efa9d..ba67bf5ae6 100644
--- a/python/paddle/fluid/graphviz.py
+++ b/python/paddle/fluid/graphviz.py
@@ -14,12 +14,13 @@
 
 import os
 import random
+import six
 import subprocess
 import logging
 
 
 def crepr(v):
-    if type(v) is str or type(v) is unicode:
+    if isinstance(v, six.string_types):
         return '"%s"' % v
     return str(v)
 
@@ -104,7 +105,7 @@ class Graph(object):
 
     def _rank_repr(self):
         ranks = sorted(
-            self.rank_groups.items(),
+            list(self.rank_groups.items()),
             cmp=lambda a, b: a[1].priority > b[1].priority)
         repr = []
         for x in ranks:
@@ -148,7 +149,7 @@ class Node(object):
             name=self.name,
             label=self.label,
             extra=',' + ','.join("%s=%s" % (key, crepr(value))
-                                 for key, value in self.attrs.items())
+                                 for key, value in list(self.attrs.items()))
             if self.attrs else "")
         return reprs
 
@@ -172,7 +173,7 @@ class Edge(object):
             target=self.target.name,
             extra="" if not self.attrs else
             "[" + ','.join("{}={}".format(attr[0], crepr(attr[1]))
-                           for attr in self.attrs.items()) + "]")
+                           for attr in list(self.attrs.items())) + "]")
         return repr
 
 
diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py
index a81e39695b..ff382d8b83 100644
--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
@@ -14,14 +14,14 @@
 
 import contextlib
 
-import core
-
-import executor
-import framework
-import io
-import parallel_executor
-import unique_name
-from trainer import check_and_get_place
+from . import core
+
+from . import executor
+from . import framework
+from . import io
+from . import parallel_executor
+from . import unique_name
+from .trainer import check_and_get_place
 
 __all__ = ['Inferencer', ]
 
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 0e640bf280..83290ac608 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import framework
+from . import framework
 import numpy as np
 import contextlib
-from framework import convert_np_dtype_to_dtype_
-from core import VarDesc
+from .framework import convert_np_dtype_to_dtype_
+from .core import VarDesc
 
 __all__ = [
     'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA',
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index f36c0a7b0a..80b258c9c9 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -16,6 +16,7 @@ import os
 import errno
 import time
 import shutil
+import six
 
 from paddle.fluid.evaluator import Evaluator
 from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable
@@ -92,34 +93,34 @@ def save_vars(executor,
     """
     Save variables to the given directory by executor.
 
-    There are two ways to specify variables to be saved: The first way, list 
-    variables in a list and assign it to the `vars`. The second way, assign the 
-    `main_program` with an existing program, then all variables in the program 
-    will be saved. The first way has a higher priority. In other words, if `vars` 
+    There are two ways to specify variables to be saved: The first way, list
+    variables in a list and assign it to the `vars`. The second way, assign the
+    `main_program` with an existing program, then all variables in the program
+    will be saved. The first way has a higher priority. In other words, if `vars`
     are assigned, the `main_program` and the `predicate` will be ignored.
 
-    The `dirname` are used to specify the folder where to save variables. 
-    If you prefer to save variables in separate files in the folder `dirname`, 
-    set `filename` None; if you prefer to save all variables in a single file, 
+    The `dirname` are used to specify the folder where to save variables.
+    If you prefer to save variables in separate files in the folder `dirname`,
+    set `filename` None; if you prefer to save all variables in a single file,
     use `filename` to specify it.
 
     Args:
         executor(Executor): The executor to run for saving variables.
         dirname(str): The directory path.
-        main_program(Program|None): The program whose variables will be saved. 
-                                    If it is None, the default main program will 
+        main_program(Program|None): The program whose variables will be saved.
+                                    If it is None, the default main program will
                                     be used automatically.
                                     Default: None
-        vars(list[Variable]|None): The list that contains all variables to save. 
+        vars(list[Variable]|None): The list that contains all variables to save.
                                    It has a higher priority than the `main_program`.
                                    Default: None
-        predicate(function|None): If it is not None, only variables in the 
-                                  `main_program` that makes predicate(variable)==True 
-                                  will be saved. It only works when we are using the 
-                                  `main_program` to specify variables (In other words 
+        predicate(function|None): If it is not None, only variables in the
+                                  `main_program` that makes predicate(variable)==True
+                                  will be saved. It only works when we are using the
+                                  `main_program` to specify variables (In other words
                                   `vars` is None).
                                   Default: None
-        filename(str|None): The file which to save all variables. If you prefer to save 
+        filename(str|None): The file which to save all variables. If you prefer to save
                             variables separately, set it to None.
                             Default: None
 
@@ -149,7 +150,7 @@ def save_vars(executor,
 
             # The second usage: using `vars` to specify variables
             var_list = [var_a, var_b, var_c]
-            fluid.io.save_vars(executor=exe, dirname=path, vars=var_list, 
+            fluid.io.save_vars(executor=exe, dirname=path, vars=var_list,
                                filename="vars_file")
             # var_a, var_b and var_c will be saved. And they are going to be
             # saved in the same file named 'var_file' in the path "./my_paddle_model".
@@ -163,7 +164,7 @@ def save_vars(executor,
         save_vars(
             executor,
             dirname=dirname,
-            vars=filter(predicate, main_program.list_vars()),
+            vars=list(filter(predicate, main_program.list_vars())),
             filename=filename)
     else:
         save_program = Program()
@@ -203,14 +204,14 @@ def save_params(executor, dirname, main_program=None, filename=None):
     This function filters out all parameters from the give `main_program`
     and then save them to the folder `dirname` or the file `filename`.
 
-    Use the `dirname` to specify the saving folder. If you would like to 
-    save parameters in separate files, set `filename` None; if you would 
-    like to save all parameters in a single file, use `filename` to specify 
+    Use the `dirname` to specify the saving folder. If you would like to
+    save parameters in separate files, set `filename` None; if you would
+    like to save all parameters in a single file, use `filename` to specify
     the file name.
 
-    NOTICE: Some variables are not Parameter while they are necessary for 
-    training. So you can NOT save and continue your training just by 
-    `save_params()` and `load_params()`. Please use `save_persistables()` 
+    NOTICE: Some variables are not Parameter while they are necessary for
+    training. So you can NOT save and continue your training just by
+    `save_params()` and `load_params()`. Please use `save_persistables()`
     and `load_persistables()` instead.
 
     Args:
@@ -220,8 +221,8 @@ def save_params(executor, dirname, main_program=None, filename=None):
                                     saved. If it is None, the default
                                     main program will be used automatically.
                                     Default: None
-        filename(str|None): The file to save all parameters. If you prefer 
-                            to save parameters in differnet files, set it 
+        filename(str|None): The file to save all parameters. If you prefer
+                            to save parameters in differnet files, set it
                             to None.
                             Default: None
 
@@ -234,7 +235,7 @@ def save_params(executor, dirname, main_program=None, filename=None):
             exe = fluid.Executor(fluid.CPUPlace())
             param_path = "./my_paddle_model"
             prog = fluid.default_main_program()
-            fluid.io.save_params(executor=exe, dirname=param_path, 
+            fluid.io.save_params(executor=exe, dirname=param_path,
                                  main_program=None)
     """
     save_vars(
@@ -248,23 +249,23 @@ def save_params(executor, dirname, main_program=None, filename=None):
 
 def save_persistables(executor, dirname, main_program=None, filename=None):
     """
-    This function filters out all variables with `persistable==True` from the 
-    give `main_program` and then saves these variables to the folder `dirname` 
+    This function filters out all variables with `persistable==True` from the
+    give `main_program` and then saves these variables to the folder `dirname`
     or file `filename`.
 
-    The `dirname` is used to specify the folder where persistable variables 
-    are going to be saved. If you would like to save variables in separate 
-    files, set `filename` None; if you would like to save all variables in a 
+    The `dirname` is used to specify the folder where persistable variables
+    are going to be saved. If you would like to save variables in separate
+    files, set `filename` None; if you would like to save all variables in a
     single file, use `filename` to specify the file name.
 
     Args:
         executor(Executor): The executor to run for saving persistable variables.
         dirname(str): The directory path.
-        main_program(Program|None): The program whose persistbale variables will 
-                                    be saved. If it is None, the default main 
+        main_program(Program|None): The program whose persistbale variables will
+                                    be saved. If it is None, the default main
                                     program will be used automatically.
                                     Default: None
-        filename(str|None): The file to saved all variables. If you prefer to 
+        filename(str|None): The file to saved all variables. If you prefer to
                             save variables in differnet files, set it to None.
                             Default: None
 
@@ -277,7 +278,7 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
             exe = fluid.Executor(fluid.CPUPlace())
             param_path = "./my_paddle_model"
             prog = fluid.default_main_program()
-            fluid.io.save_persistables(executor=exe, dirname=param_path, 
+            fluid.io.save_persistables(executor=exe, dirname=param_path,
                                        main_program=None)
     """
     save_vars(
@@ -298,34 +299,34 @@ def load_vars(executor,
     """
     Load variables from the given directory by executor.
 
-    There are two ways to specify variables to be loaded: The first way, list 
-    variables in a list and assign it to the `vars`. The second way, assign the 
-    `main_program` with an existing program, then all variables in the program 
-    will be loaded. The first way has a higher priority. In other words if `vars` 
+    There are two ways to specify variables to be loaded: The first way, list
+    variables in a list and assign it to the `vars`. The second way, assign the
+    `main_program` with an existing program, then all variables in the program
+    will be loaded. The first way has a higher priority. In other words if `vars`
     are assigned, the `main_program` and the `predicate` will be ignored.
 
-    The `dirname` are used to specify the folder where to load variables. 
-    If variables were saved in separate files in the folder `dirname`, 
-    set `filename` None; if all variables were saved in a single file, 
+    The `dirname` are used to specify the folder where to load variables.
+    If variables were saved in separate files in the folder `dirname`,
+    set `filename` None; if all variables were saved in a single file,
     use `filename` to specify it.
 
     Args:
         executor(Executor): The executor to run for loading variables.
         dirname(str): The directory path.
-        main_program(Program|None): The program whose variables will be loaded. 
-                                    If it is None, the default main program will 
+        main_program(Program|None): The program whose variables will be loaded.
+                                    If it is None, the default main program will
                                     be used automatically.
                                     Default: None
-        vars(list[Variable]|None): The list that contains all variables to load. 
+        vars(list[Variable]|None): The list that contains all variables to load.
                                    It has a higher priority than the `main_program`.
                                    Default: None
-        predicate(function|None): If it is not None, only variables in the 
-                                  `main_program` that makes predicate(variable)==True 
-                                  will be loaded. It only works when we are using the 
-                                  `main_program` to specify variables (In other words 
+        predicate(function|None): If it is not None, only variables in the
+                                  `main_program` that makes predicate(variable)==True
+                                  will be loaded. It only works when we are using the
+                                  `main_program` to specify variables (In other words
                                   `vars` is None).
                                   Default: None
-        filename(str|None): The file which saved all required variables. If variables 
+        filename(str|None): The file which saved all required variables. If variables
                             were saved in differnet files, set it to None.
                             Default: None
 
@@ -355,9 +356,9 @@ def load_vars(executor,
 
             # The second usage: using `vars` to specify variables
             var_list = [var_a, var_b, var_c]
-            fluid.io.load_vars(executor=exe, dirname=path, vars=var_list, 
+            fluid.io.load_vars(executor=exe, dirname=path, vars=var_list,
                                filename="vars_file")
-            # var_a, var_b and var_c will be loaded. And they are supposed to haven 
+            # var_a, var_b and var_c will be loaded. And they are supposed to haven
             # been saved in the same file named 'var_file' in the path "./my_paddle_model".
     """
     if vars is None:
@@ -369,8 +370,7 @@ def load_vars(executor,
         load_vars(
             executor,
             dirname=dirname,
-            main_program=main_program,
-            vars=filter(predicate, main_program.list_vars()),
+            vars=list(filter(predicate, main_program.list_vars())),
             filename=filename)
     else:
         load_prog = Program()
@@ -414,15 +414,15 @@ def load_params(executor, dirname, main_program=None, filename=None):
     and then trys to load these parameters from the folder `dirname` or
     the file `filename`.
 
-    Use the `dirname` to specify the folder where parameters were saved. If 
-    parameters were saved in separate files in the folder `dirname`, set 
-    `filename` None; if all parameters were saved in a single file, use 
+    Use the `dirname` to specify the folder where parameters were saved. If
+    parameters were saved in separate files in the folder `dirname`, set
+    `filename` None; if all parameters were saved in a single file, use
     `filename` to specify the file name.
 
-    NOTICE: Some variables are not Parameter while they are necessary for 
-    training. So you can NOT save and continue your training just by 
-    `save_params()` and `load_params()`. Please use `save_persistables()` 
-    and `load_persistables()` instead. 
+    NOTICE: Some variables are not Parameter while they are necessary for
+    training. So you can NOT save and continue your training just by
+    `save_params()` and `load_params()`. Please use `save_persistables()`
+    and `load_persistables()` instead.
 
     Args:
         executor(Executor): The executor to run for loading parameters.
@@ -431,7 +431,7 @@ def load_params(executor, dirname, main_program=None, filename=None):
                                     loaded. If it is None, the default
                                     main program will be used automatically.
                                     Default: None
-        filename(str|None): The file which saved all parameters. If parameters 
+        filename(str|None): The file which saved all parameters. If parameters
                             were saved in differnet files, set it to None.
                             Default: None
 
@@ -444,7 +444,7 @@ def load_params(executor, dirname, main_program=None, filename=None):
             exe = fluid.Executor(fluid.CPUPlace())
             param_path = "./my_paddle_model"
             prog = fluid.default_main_program()
-            fluid.io.load_params(executor=exe, dirname=param_path, 
+            fluid.io.load_params(executor=exe, dirname=param_path,
                                 main_program=None)
     """
     load_vars(
@@ -457,23 +457,23 @@ def load_params(executor, dirname, main_program=None, filename=None):
 
 def load_persistables(executor, dirname, main_program=None, filename=None):
     """
-    This function filters out all variables with `persistable==True` from the 
-    give `main_program` and then trys to load these variables from the folder 
+    This function filters out all variables with `persistable==True` from the
+    give `main_program` and then trys to load these variables from the folder
     `dirname` or the file `filename`.
 
-    Use the `dirname` to specify the folder where persistable variables were 
-    saved. If variables were saved in separate files, set `filename` None; 
-    if all variables were saved in a single file, use `filename` to specify 
+    Use the `dirname` to specify the folder where persistable variables were
+    saved. If variables were saved in separate files, set `filename` None;
+    if all variables were saved in a single file, use `filename` to specify
     the file name.
 
     Args:
         executor(Executor): The executor to run for loading persistable variables.
         dirname(str): The directory path.
-        main_program(Program|None): The program whose persistbale variables will 
-                                    be loaded. If it is None, the default main 
+        main_program(Program|None): The program whose persistbale variables will
+                                    be loaded. If it is None, the default main
                                     program will be used automatically.
                                     Default: None
-        filename(str|None): The file which saved all variables. If variables were 
+        filename(str|None): The file which saved all variables. If variables were
                             saved in differnet files, set it to None.
                             Default: None
 
@@ -486,7 +486,7 @@ def load_persistables(executor, dirname, main_program=None, filename=None):
             exe = fluid.Executor(fluid.CPUPlace())
             param_path = "./my_paddle_model"
             prog = fluid.default_main_program()
-            fluid.io.load_persistables(executor=exe, dirname=param_path, 
+            fluid.io.load_persistables(executor=exe, dirname=param_path,
                                        main_program=None)
     """
     load_vars(
@@ -565,20 +565,20 @@ def save_inference_model(dirname,
 
     Args:
         dirname(str): The directory path to save the inference model.
-        feeded_var_names(list[str]): Names of variables that need to be feeded data 
+        feeded_var_names(list[str]): Names of variables that need to be feeded data
                                      during inference.
-        target_vars(list[Variable]): Variables from which we can get inference 
+        target_vars(list[Variable]): Variables from which we can get inference
                                      results.
         executor(Executor): The executor that saves the inference model.
-        main_program(Program|None): The original program, which will be pruned to 
-                                    build the inference model. If is setted None, 
+        main_program(Program|None): The original program, which will be pruned to
+                                    build the inference model. If is setted None,
                                     the default main program will be used.
                                     Default: None.
-        model_filename(str|None): The name of file to save the inference program 
-                                  itself. If is setted None, a default filename 
+        model_filename(str|None): The name of file to save the inference program
+                                  itself. If is setted None, a default filename
                                   `__model__` will be used.
-        params_filename(str|None): The name of file to save all related parameters. 
-                                   If it is setted None, parameters will be saved 
+        params_filename(str|None): The name of file to save all related parameters.
+                                   If it is setted None, parameters will be saved
                                    in separate files .
 
     Returns:
@@ -596,20 +596,32 @@ def save_inference_model(dirname,
             fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'],
                          target_vars=[predict_var], executor=exe)
 
-            # In this exsample, the function will prune the default main program 
-            # to make it suitable for infering the `predict_var`. The pruned 
-            # inference program is going to be saved in the "./infer_model/__model__" 
+            # In this exsample, the function will prune the default main program
+            # to make it suitable for infering the `predict_var`. The pruned
+            # inference program is going to be saved in the "./infer_model/__model__"
             # and parameters are going to be saved in separate files under folder
-            # "./infer_model". 
+            # "./infer_model".
 
     """
-    if isinstance(feeded_var_names, basestring):
+    if isinstance(feeded_var_names, six.binary_type):
         feeded_var_names = [feeded_var_names]
+    elif isinstance(feeded_var_names, six.text_type):
+        feeded_var_names = [feeded_var_names.encode()]
     else:
         if len(feeded_var_names) > 0:
+            # TODO(paddle-dev): polish these code blocks
             if not (bool(feeded_var_names) and all(
-                    isinstance(name, basestring) for name in feeded_var_names)):
-                raise ValueError("'feed_var_names' should be a list of str.")
+                    isinstance(name, six.binary_type)
+                    for name in feeded_var_names)):
+                if not (all(
+                        isinstance(name, six.text_type)
+                        for name in feeded_var_names)):
+                    raise ValueError(
+                        "'feed_var_names' should be a list of str.")
+                else:
+                    feeded_var_names = [
+                        name.encode() for name in feeded_var_names
+                    ]
 
     if isinstance(target_vars, Variable):
         target_vars = [target_vars]
@@ -666,22 +678,22 @@ def load_inference_model(dirname,
         dirname(str): The directory path
         executor(Executor): The executor to run for loading inference model.
         model_filename(str|None): The name of file to load inference program.
-                                  If it is None, the default filename 
+                                  If it is None, the default filename
                                   '__model__' will be used.
                                   Default: None
         params_filename(str|None): The name of file to load all parameters.
-                                   It is only used for the case that all 
-                                   parameters were saved in a single binary 
-                                   file. If parameters were saved in separate 
+                                   It is only used for the case that all
+                                   parameters were saved in a single binary
+                                   file. If parameters were saved in separate
                                    files, set it as 'None'.
 
     Returns:
         tuple: The return of this function is a tuple with three elements:
-        (program, feed_target_names, fetch_targets). The `program` is a 
-        Program, it's the program for inference. The `feed_target_names` is 
-        a list of str, it contains Names of variables that need to feed 
-        data in the inference program. The `fetch_targets` is a list of 
-        Variable. It contains variables from which we can get inference 
+        (program, feed_target_names, fetch_targets). The `program` is a
+        Program, it's the program for inference. The `feed_target_names` is
+        a list of str, it contains Names of variables that need to feed
+        data in the inference program. The `fetch_targets` is a list of
+        Variable. It contains variables from which we can get inference
         results.
 
     Raises:
@@ -692,17 +704,17 @@ def load_inference_model(dirname,
 
             exe = fluid.Executor(fluid.CPUPlace())
             path = "./infer_model"
-            [inference_program, feed_target_names, fetch_targets] = 
+            [inference_program, feed_target_names, fetch_targets] =
                 fluid.io.load_inference_model(dirname=path, executor=exe)
             results = exe.run(inference_program,
                           feed={feed_target_names[0]: tensor_img},
                           fetch_list=fetch_targets)
 
-            # In this exsample, the inference program was saved in the 
-            # "./infer_model/__model__" and parameters were saved in 
-            # separate files in ""./infer_model". 
-            # After getting inference program, feed target names and 
-            # fetch targets, we can use an Executor to run the inference 
+            # In this exsample, the inference program was saved in the
+            # "./infer_model/__model__" and parameters were saved in
+            # separate files in ""./infer_model".
+            # After getting inference program, feed target names and
+            # fetch targets, we can use an Executor to run the inference
             # program to get the inference result.
 
     """
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index de752d1dae..0c2b1eb795 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -14,12 +14,14 @@
 
 import copy
 import itertools
+import six
 
-from framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
-import unique_name
+from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
+from . import unique_name
 from paddle.fluid.initializer import Constant, Xavier
-from param_attr import ParamAttr, WeightNormParamAttr
-import core
+from .param_attr import ParamAttr, WeightNormParamAttr
+from . import core
+from six.moves import zip
 
 
 class LayerHelper(object):
@@ -83,7 +85,7 @@ class LayerHelper(object):
             raise ValueError("parameter number mismatch")
         elif len(param_attr) == 1 and length != 1:
             tmp = [None] * length
-            for i in xrange(length):
+            for i in range(length):
                 tmp[i] = copy.deepcopy(param_attr[0])
             param_attr = tmp
         return param_attr
@@ -91,7 +93,7 @@ class LayerHelper(object):
     def iter_inputs_and_params(self, input_param_name='input'):
         inputs = self.multiple_input(input_param_name)
         param_attrs = self.multiple_param_attr(len(inputs))
-        for ipt, param_attr in itertools.izip(inputs, param_attrs):
+        for ipt, param_attr in zip(inputs, param_attrs):
             yield ipt, param_attr
 
     def input_dtype(self, input_param_name='input'):
@@ -218,7 +220,7 @@ class LayerHelper(object):
                 norm = __norm_op(reshape, dim=0, block=block)
                 __reshape_op(norm, out=out, shape=out_shape, block=block)
             else:
-                perm = range(len(x.shape))
+                perm = list(range(len(x.shape)))
                 perm[0], perm[dim] = dim, 0
                 transpose = __transpose_op(x, perm, block=block)
                 norm = __norm_op(transpose, dim=0, block=block)
@@ -397,8 +399,10 @@ class LayerHelper(object):
         act = self.kwargs.get('act', None)
         if act is None:
             return input_var
-        if isinstance(act, basestring):
+        if isinstance(act, six.string_types):
             act = {'type': act}
+        else:
+            raise TypeError(str(act) + " should be unicode or str")
 
         if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'):
             act['use_cudnn'] = self.kwargs.get('use_cudnn')
diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py
index 4917e67de0..a48e360463 100644
--- a/python/paddle/fluid/layers/__init__.py
+++ b/python/paddle/fluid/layers/__init__.py
@@ -12,25 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import ops
-from ops import *
-import nn
-from nn import *
-import io
-from io import *
-import tensor
-from tensor import *
-import control_flow
-from control_flow import *
-import device
-from device import *
-import math_op_patch
-from math_op_patch import *
-import detection
-from detection import *
-import metric_op
-from metric_op import *
-from learning_rate_scheduler import *
+from . import ops
+from .ops import *
+from . import nn
+from .nn import *
+from . import io
+from .io import *
+from . import tensor
+from .tensor import *
+from . import control_flow
+from .control_flow import *
+from . import device
+from .device import *
+from . import math_op_patch
+from .math_op_patch import *
+from . import detection
+from .detection import *
+from . import metric_op
+from .metric_op import *
+from .learning_rate_scheduler import *
 
 __all__ = []
 __all__ += nn.__all__
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index f05ae6d5d1..9fb7b4d0ca 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -13,14 +13,16 @@
 # limitations under the License.
 import contextlib
 
-from layer_function_generator import autodoc, templatedoc
-from tensor import assign, fill_constant
+from .layer_function_generator import autodoc, templatedoc
+from .tensor import assign, fill_constant
 from .. import core
 from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
 from ..initializer import force_init_on_cpu
-from ops import logical_and, logical_not, logical_or
+from .ops import logical_and, logical_not, logical_or
 import numpy
+import warnings
+from functools import reduce
 
 __all__ = [
     'While',
@@ -275,11 +277,14 @@ class ParallelDo(object):
           avg_cost = fluid.layers.mean(x=cost)
 
     .. warning::
-    
+
        It will be soon deprecated, please use ParallelExecutor instead.
     """
 
     def __init__(self, places, use_nccl=False, name=None):
+        warnings.warn(
+            "API ParallelDo is deprecated since 0.15.0. Please use ParallelExecutor instead.",
+            Warning)
         self.helper = LayerHelper("parallel_do", name=name)
         self.inputs = []
         self.places = places
@@ -338,7 +343,7 @@ class ParallelDo(object):
 
         return [parent_block.var(name) for name in params]
 
-    def complete_op(self):
+    def _complete_op(self):
         main_program = self.helper.main_program
         current_block = main_program.current_block()
         parent_block = self.parent_block()
@@ -394,7 +399,7 @@ class BlockGuardWithCompletion(BlockGuard):
         if exc_type is not None:
             return False
         self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
-        self.rnn.complete_op()
+        self.rnn._complete_op()
         return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val,
                                                               exc_tb)
 
@@ -470,7 +475,7 @@ class StaticRNN(object):
             if shape is None or batch_ref is None:
                 raise ValueError(
                     "if init is None, memory at least need shape and batch_ref")
-            parent_block = self.parent_block()
+            parent_block = self._parent_block()
             var_name = unique_name.generate("@".join(
                 [self.helper.name, "memory_boot"]))
             boot_var = parent_block.create_var(
@@ -527,7 +532,7 @@ class StaticRNN(object):
             outputs={'Out': tmp_o},
             attrs={'dtype': o.dtype})
 
-        out_var = self.parent_block().create_var(
+        out_var = self._parent_block().create_var(
             name=tmp_o.name,
             shape=[self.seq_len] + list(tmp_o.shape),
             dtype=tmp_o.dtype)
@@ -543,7 +548,7 @@ class StaticRNN(object):
             raise TypeError("update memory should take variables")
         self.memories[mem.name].mem = var
 
-    def parent_block(self):
+    def _parent_block(self):
         prog = self.helper.main_program
         parent_idx = prog.current_block().parent_idx
         assert parent_idx >= 0
@@ -560,10 +565,10 @@ class StaticRNN(object):
         else:
             return self.outputs
 
-    def complete_op(self):
+    def _complete_op(self):
         main_program = self.helper.main_program
         rnn_block = main_program.current_block()
-        parent_block = self.parent_block()
+        parent_block = self._parent_block()
 
         local_inputs = set()
 
@@ -597,7 +602,7 @@ class StaticRNN(object):
         boot_memories = []
         pre_memories = []
         memories = []
-        for _, mem in self.memories.iteritems():
+        for _, mem in list(self.memories.items()):
             boot_memories.append(mem.init)
             pre_memories.append(mem.pre_mem.name)
             mem_var = rnn_block.var(mem.mem.name)
@@ -643,7 +648,7 @@ class WhileGuard(BlockGuard):
         if exc_type is not None:
             return False
         self.while_op.status = While.AFTER_WHILE_BLOCK
-        self.while_op.complete()
+        self.while_op._complete()
         return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb)
 
 
@@ -690,7 +695,7 @@ class While(object):
     def block(self):
         return WhileGuard(self)
 
-    def complete(self):
+    def _complete(self):
         main_program = self.helper.main_program
         while_block = main_program.current_block()
         parent_block = main_program.block(main_program.current_block()
@@ -815,21 +820,21 @@ def max_sequence_len(rank_table):
 
 
 def lod_tensor_to_array(x, table):
-    """ 
+    """
     Convert a LoDTensor to a LoDTensorArray.
 
-    This function split a LoDTesnor to a LoDTensorArray according to its LoD 
-    information. LoDTensorArray is an alias of C++ std::vector<LoDTensor> in 
-    PaddlePaddle. The generated LoDTensorArray of this function can be further read 
-    or written by `read_from_array()` and `write_to_array()` operators. However, 
-    this function is generally an internal component of PaddlePaddle `DynamicRNN`. 
+    This function split a LoDTesnor to a LoDTensorArray according to its LoD
+    information. LoDTensorArray is an alias of C++ std::vector<LoDTensor> in
+    PaddlePaddle. The generated LoDTensorArray of this function can be further read
+    or written by `read_from_array()` and `write_to_array()` operators. However,
+    this function is generally an internal component of PaddlePaddle `DynamicRNN`.
     Users should not use it directly.
 
     Args:
         x (Variable|list): The LoDTensor to be converted to a LoDTensorArray.
         table (ParamAttr|list): The variable that stores the level of lod
                                 which is ordered by sequence length in
-                                descending order. It is generally generated 
+                                descending order. It is generally generated
                                 by `layers.lod_rank_table()` API.
 
     Returns:
@@ -1063,9 +1068,9 @@ def array_read(array, i):
         Given:
 
         array = [0.6, 0.1, 0.3, 0.1]
-        
+
         And:
-        
+
         i = 2
 
         Then:
@@ -1172,9 +1177,9 @@ def array_length(array):
 
 class ConditionalBlockGuard(BlockGuard):
     """
-    ConditionalBlockGuard is derived from BlockGuard. It is dedicated for 
-    holding a ConditionalBlock, and helping users entering and exiting the 
-    ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard 
+    ConditionalBlockGuard is derived from BlockGuard. It is dedicated for
+    holding a ConditionalBlock, and helping users entering and exiting the
+    ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard
     is generally an internal component of IfElse, users should not use it directly.
     """
 
@@ -1508,7 +1513,7 @@ class IfElse(object):
     def __call__(self):
         if self.status != self.OUT_IF_ELSE_BLOCKS:
             raise ValueError("IfElse::__call__ must be out of sub-block")
-        false_len, true_len = map(len, self.output_table)
+        false_len, true_len = list(map(len, self.output_table))
         if false_len == 0 and true_len == 0:
             raise ValueError("Must invoke true_block/false_block before "
                              "__call__")
@@ -1928,7 +1933,7 @@ def is_empty(x, cond=None, **ignored):
 
     Args:
         x (Variable): The Variable to be tested.
-        cond (Variable|None): Output parameter. Returns the test result 
+        cond (Variable|None): Output parameter. Returns the test result
                               of given 'x'. Default: None
 
     Returns:
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 3ef4afa691..9fae96d9bc 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -15,12 +15,13 @@
 All layers just related to the detection neural network.
 """
 
-from layer_function_generator import generate_layer_fn
-from layer_function_generator import autodoc, templatedoc
+from .layer_function_generator import generate_layer_fn
+from .layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
-import tensor
-import nn
+from . import tensor
+from . import nn
 import math
+from functools import reduce
 
 __all__ = [
     'prior_box',
@@ -37,6 +38,7 @@ __all__ = [
 __auto__ = [
     'iou_similarity',
     'box_coder',
+    'polygon_box_transform',
 ]
 
 __all__ += __auto__
@@ -1031,7 +1033,7 @@ def multi_box_head(inputs,
         min_sizes = []
         max_sizes = []
         step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2)))
-        for ratio in xrange(min_ratio, max_ratio + 1, step):
+        for ratio in range(min_ratio, max_ratio + 1, step):
             min_sizes.append(base_size * ratio / 100.)
             max_sizes.append(base_size * (ratio + step) / 100.)
         min_sizes = [base_size * .10] + min_sizes
diff --git a/python/paddle/fluid/layers/device.py b/python/paddle/fluid/layers/device.py
index 384d302a70..bb1fb7fd57 100644
--- a/python/paddle/fluid/layers/device.py
+++ b/python/paddle/fluid/layers/device.py
@@ -15,7 +15,7 @@
 All util layers.
 """
 
-from layer_function_generator import autodoc
+from .layer_function_generator import autodoc
 from ..framework import unique_name
 from ..layer_helper import LayerHelper
 from ..annotations import deprecated
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index df6becabd1..327ae30981 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -16,8 +16,8 @@ import multiprocessing
 import threading
 
 from ..data_feeder import DataFeeder
-from control_flow import BlockGuard
-from layer_function_generator import templatedoc
+from .control_flow import BlockGuard
+from .layer_function_generator import templatedoc
 from .. import core
 from ..executor import global_scope
 from ..framework import convert_np_dtype_to_dtype_, default_main_program, \
@@ -69,7 +69,7 @@ def data(name,
     """
     helper = LayerHelper('data', **locals())
     shape = list(shape)
-    for i in xrange(len(shape)):
+    for i in range(len(shape)):
         if shape[i] is None:
             shape[i] = -1
             append_batch_size = False
@@ -387,9 +387,9 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
     Create a uniform random data generator
 
     This layer returns a Reader Variable.
-    Instead of opening a file and reading data from it, this 
-    Reader Variable generates float uniform random data by itself. 
-    It can be used as a dummy reader to test a network without 
+    Instead of opening a file and reading data from it, this
+    Reader Variable generates float uniform random data by itself.
+    It can be used as a dummy reader to test a network without
     opening a real file.
 
     Args:
@@ -443,9 +443,6 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
     main_prog_var = _copy_reader_var_(default_main_program().current_block(),
                                       startup_var)
 
-    if for_parallel:
-        main_prog_var = parallel(reader=main_prog_var)
-
     return monkey_patch_reader_methods(main_prog_var)
 
 
@@ -710,9 +707,9 @@ def open_files(filenames,
     """
     Open files
 
-    This layer takes a list of files to read from and returns a Reader Variable. 
-    Via the Reader Variable, we can get data from given files. All files must 
-    have name suffixs to indicate their formats, e.g., '*.recordio'. 
+    This layer takes a list of files to read from and returns a Reader Variable.
+    Via the Reader Variable, we can get data from given files. All files must
+    have name suffixs to indicate their formats, e.g., '*.recordio'.
 
     Args:
        filenames(list): The list of file names.
@@ -828,9 +825,9 @@ def shuffle(reader, buffer_size):
 
 def batch(reader, batch_size):
     """
-    This layer is a reader decorator. It takes a reader and adds 
-    'batching' decoration on it. When reading with the result 
-    decorated reader, output data will be automatically organized 
+    This layer is a reader decorator. It takes a reader and adds
+    'batching' decoration on it. When reading with the result
+    decorated reader, output data will be automatically organized
     to the form of batches.
 
     Args:
@@ -855,11 +852,11 @@ def batch(reader, batch_size):
             # If we read data with the raw_reader:
             #     data = fluid.layers.read_file(raw_reader)
             # We can only get data instance by instance.
-            # 
+            #
             # However, if we read data with the batch_reader:
             #     data = fluid.layers.read_file(batch_reader)
-            # Each 5 adjacent instances will be automatically combined together 
-            # to become a batch. So what we get('data') is a batch data instead 
+            # Each 5 adjacent instances will be automatically combined together
+            # to become a batch. So what we get('data') is a batch data instead
             # of an instance.
     """
     return __create_unshared_decorated_reader__(
@@ -906,8 +903,8 @@ def read_file(reader):
     """
     Execute the given reader and get data via it.
 
-    A reader is also a Variable. It can be a raw reader generated by 
-    `fluid.layers.open_files()` or a decorated one generated by 
+    A reader is also a Variable. It can be a raw reader generated by
+    `fluid.layers.open_files()` or a decorated one generated by
     `fluid.layers.double_buffer()` and so on.
 
     Args:
@@ -1008,7 +1005,7 @@ class Preprocessor(object):
         source_lod_levels = self.underlying_reader.desc.lod_levels()
         self.source_var_names = [
             unique_name("preprocessor_source")
-            for _ in xrange(len(source_shapes))
+            for _ in range(len(source_shapes))
         ]
         source_vars = []
         for var_name, shape, dtype, lod_level in zip(
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 3096389101..c0d72620b1 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import re
-import cStringIO
 import functools
 import warnings
 import string
 
+from six.moves import cStringIO
 from ..proto import framework_pb2
 from ..framework import OpProtoHolder, Variable
 from ..layer_helper import LayerHelper
@@ -70,7 +70,7 @@ def _generate_doc_string_(op_proto):
     if not isinstance(op_proto, framework_pb2.OpProto):
         raise TypeError("OpProto should be `framework_pb2.OpProto`")
 
-    buf = cStringIO.StringIO()
+    buf = cStringIO()
     buf.write(escape_math(op_proto.comment))
     buf.write('\nArgs:\n')
     for each_input in op_proto.inputs:
@@ -119,9 +119,9 @@ def generate_layer_fn(op_type):
     """
     op_proto = OpProtoHolder.instance().get_op_proto(op_type)
     not_intermediate_outputs = \
-        filter(lambda output: not output.intermediate, op_proto.outputs)
+        [output for output in op_proto.outputs if not output.intermediate]
     intermediate_outputs = \
-        filter(lambda output: output.intermediate, op_proto.outputs)
+        [output for output in op_proto.outputs if output.intermediate]
 
     if len(not_intermediate_outputs) != 1:
         raise ValueError("Only one non intermediate output operator can be",
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index c7966e36f1..daf91a40f7 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -20,10 +20,10 @@ User can also implement their own learning_rate_decay
 strategy according to this module.
 """
 
-import control_flow
-import nn
-import ops
-import tensor
+from . import control_flow
+from . import nn
+from . import ops
+from . import tensor
 from ..initializer import init_on_cpu
 from ..framework import default_main_program, Parameter
 
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index f814c41633..0e10a91d25 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from ..framework import Variable, unique_name
-from layer_function_generator import OpProtoHolder
+from .layer_function_generator import OpProtoHolder
 from ..initializer import force_init_on_cpu
 
 
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
index e7d7a9e826..49bae1e8af 100644
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -20,7 +20,7 @@ from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
-import nn
+from . import nn
 
 __all__ = ['accuracy', 'auc']
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 5d7f1eadd9..0960b54123 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -33,11 +33,12 @@ from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
-from layer_function_generator import autodoc, templatedoc
-from tensor import concat
-import utils
+from .layer_function_generator import autodoc, templatedoc
+from .tensor import concat
+from . import utils
 import random
 from .. import unique_name
+from functools import reduce
 
 __all__ = [
     'fc',
@@ -949,6 +950,10 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
     helper = LayerHelper('dropout', **locals())
     out = helper.create_tmp_variable(dtype=x.dtype)
     mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
+
+    if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
+        seed = helper.main_program.random_seed
+
     helper.append_op(
         type='dropout',
         inputs={'X': [x]},
@@ -1313,13 +1318,16 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
 
 def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
     """
-    The input of the softmax layer is a 2-D tensor with shape N x K (N is the
-    batch_size, K is the dimension of input feature). The output tensor has the
-    same shape as the input tensor.
+    The input of the softmax operator is a tensor of any rank. The output tensor 
+    has the same shape as the input.
 
-    For each row of the input tensor, the softmax operator squashes the
-    K-dimensional vector of arbitrary real values to a K-dimensional vector of real
-    values in the range [0, 1] that add up to 1.
+    The input tensor will first be logically flattened to a 2-D matrix. The matrix's 
+    second dimension(row length) is as same as the last dimension of the input 
+    tensor, and the first dimension(column length) is the product of all other 
+    dimensions of the input tensor. For each row of the matrix, the softmax operator 
+    squashes the K-dimensional(K is the width of the matrix, which is also the size 
+    of the input tensor's last dimension) vector of arbitrary real values to a 
+    K-dimensional vector of real values in the range [0, 1] that add up to 1.
 
     It computes the exponential of the given dimension and the sum of exponential
     values of all the other dimensions in the K-dimensional vector input.
@@ -1327,7 +1335,7 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
     exponential values of all the other dimensions is the output of the softmax
     operator.
 
-    For each row :math:`i` and each column :math:`j` in Input(X), we have:
+    For each row :math:`i` and each column :math:`j` in the matrix, we have:
 
     .. math::
 
@@ -4473,15 +4481,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                 "except one unknown dimension.")
 
     helper = LayerHelper("reshape", **locals())
-    reshaped = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_tmp_variable(dtype=x.dtype)
     helper.append_op(
         type="reshape",
         inputs=inputs,
-        attrs={"shape": shape,
-               "inplace": inplace},
-        outputs={"Out": reshaped})
+        attrs={"shape": shape},
+        outputs={"Out": out})
 
-    return helper.append_activation(reshaped)
+    return helper.append_activation(out)
 
 
 def lod_reset(x, y=None, target_lod=None):
@@ -4843,7 +4850,7 @@ def dice_loss(input, label, epsilon=0.00001):
             loss = fluid.layers.dice_loss(input=predictions, label=label, 2)
     """
     label = one_hot(label, depth=input.shape[-1])
-    reduce_dim = range(1, len(input.shape))
+    reduce_dim = list(range(1, len(input.shape)))
     inse = reduce_sum(input * label, dim=reduce_dim)
     dice_denominator = reduce_sum(
         input, dim=reduce_dim) + reduce_sum(
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 9e97ec9a6f..f70c7f2258 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from layer_function_generator import generate_layer_fn
+from .layer_function_generator import generate_layer_fn
 
 __activations__ = [
     'sigmoid',
@@ -66,9 +66,7 @@ __all__ = [
     'scatter',
     'sum',
     'slice',
-    'polygon_box_transform',
     'shape',
-    'iou_similarity',
     'maxout',
 ] + __activations__
 
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index b6614ecf3b..b93d721c12 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -18,7 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_
 from ..framework import Variable
 from ..initializer import Constant, force_init_on_cpu
 from ..core import VarDesc
-from layer_function_generator import templatedoc
+from .layer_function_generator import templatedoc
 import numpy
 
 __all__ = [
diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
index b2b3186c1e..53c33616f5 100644
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import core
+from . import core
 import numpy as np
 
 __all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
@@ -24,7 +24,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
 
     Create a lod tensor by doing the following:
 
-    1. Check that the length-based level of detail (LoD) also known as 
+    1. Check that the length-based level of detail (LoD) also known as
        recursive_sequence_lengths of the input is valid.
 
     2. Convert recursive_sequence_lengths to a offset-based LoD.
@@ -33,7 +33,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
        CPU or GPU device (based on input place).
 
     4. Set the level of detail (LoD) using the offset-based LoD.
-    
+
     Examples:
 
         Suppose we want LoDTensor to hold data for sequences of word, where each
@@ -51,7 +51,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
     Args:
         data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a
             list holding the data to be copied.
-        recursive_seq_lens(list): a list of lists indicating the length-based level of detail 
+        recursive_seq_lens(list): a list of lists indicating the length-based level of detail
             info specified by the user.
         place(Place): CPU or GPU place indicating where the data in the new
             LoDTensor will be stored.
@@ -62,10 +62,10 @@ def create_lod_tensor(data, recursive_seq_lens, place):
     if isinstance(data, core.LoDTensor):
         return create_lod_tensor(np.array(data), recursive_seq_lens, place)
     elif isinstance(data, list):
-        # When input data is a list, it only deal with the case where the base element 
-        # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated 
-        # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number 
-        # of words or other indexes in the sequence. 
+        # When input data is a list, it only deal with the case where the base element
+        # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated
+        # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number
+        # of words or other indexes in the sequence.
         new_recursive_seq_lens = []
         for seq in data:
             new_recursive_seq_lens.append(len(seq))
@@ -109,12 +109,12 @@ def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
     Suppose we want LoDTensor to hold data for sequences of word, where each
     word is represented by an integer. If we want to create a LoDTensor to
     represent two sentences, one of 2 words, and one of 3 words. Then
-    'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]]. 
-    Then the overall shape of the LoDTensor would be [5, 1], holding 5 words 
+    'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]].
+    Then the overall shape of the LoDTensor would be [5, 1], holding 5 words
     for two sentences.
 
     Args:
-        recursive_seq_lens(list): a list of lists indicating the length-based 
+        recursive_seq_lens(list): a list of lists indicating the length-based
             level of detail info specified by the user.
         base_shape(list): the shape of the basic element to be held by the
             LoDTensor.
@@ -124,11 +124,11 @@ def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
         high(int): the upper bound of the random integers.
 
     Returns:
-        A fluid LoDTensor object with tensor data and recursive_seq_lens info. 
+        A fluid LoDTensor object with tensor data and recursive_seq_lens info.
     """
     assert isinstance(base_shape, list), "base_shape should be a list"
     # append the total number of basic elements to the front of its shape
     overall_shape = [sum(recursive_seq_lens[-1])] + base_shape
-    # the range of integer data elements is [low, high]    
+    # the range of integer data elements is [low, high]
     data = np.random.random_integers(low, high, overall_shape).astype("int64")
     return create_lod_tensor(data, recursive_seq_lens, place)
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index b37b09ac81..cd89345227 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -79,10 +79,10 @@ class MetricBase(object):
         """
         states = {
             attr: value
-            for attr, value in self.__dict__.iteritems()
+            for attr, value in list(self.__dict__.items())
             if not attr.startswith("_")
         }
-        for attr, value in states.iteritems():
+        for attr, value in list(states.items()):
             if isinstance(value, int):
                 setattr(self, attr, 0)
             elif isinstance(value, float):
@@ -105,7 +105,7 @@ class MetricBase(object):
         """
         states = {
             attr: value
-            for attr, value in self.__dict__.iteritems()
+            for attr, value in list(self.__dict__.items())
             if not attr.startswith("_")
         }
         config = {}
diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py
index 73946a0721..623a7d3fd0 100644
--- a/python/paddle/fluid/net_drawer.py
+++ b/python/paddle/fluid/net_drawer.py
@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
 try:
-    from graphviz import Digraph
+    from .graphviz import Digraph
 except ImportError:
     logger.info(
         'Cannot import graphviz, which is required for drawing a network. This '
@@ -77,7 +77,7 @@ def parse_graph(program, graph, var_dict, **kwargs):
     # fill the known variables
     for block in program.blocks:
         for var in block.vars:
-            if not var_dict.has_key(var):
+            if var not in var_dict:
                 var_dict[var] = "Feed"
 
     temp_id = 0
@@ -93,17 +93,17 @@ def parse_graph(program, graph, var_dict, **kwargs):
                     var_dict[arg] = op.type
             for e in op.inputs:
                 for arg in e.arguments:
-                    if var_dict.has_key(arg):
+                    if arg in var_dict:
                         graph.edge(**draw_edge(var_dict, op, e, arg))
         break  # only plot the first block
 
 
 def draw_graph(startup_program, main_program, **kwargs):
-    if kwargs.has_key("graph_attr"):
+    if "graph_attr" in kwargs:
         GRAPH_STYLE.update(kwargs[graph_attr])
-    if kwargs.has_key("node_attr"):
+    if "node_attr" in kwargs:
         OP_STYLE.update(kwargs[node_attr])
-    if kwargs.has_key("edge_attr"):
+    if "edge_attr" in kwargs:
         VAR_STYLE.update(kwargs[edge_attr])
 
     graph_id = unique_id()
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
index 9b3f2aebee..08480671d8 100644
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import layers
+from . import layers
 
 __all__ = [
     "simple_img_conv_pool",
@@ -210,7 +210,7 @@ def img_conv_group(input,
     conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
     conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)
 
-    for i in xrange(len(conv_num_filter)):
+    for i in range(len(conv_num_filter)):
         local_conv_act = conv_act
         if conv_with_batchnorm[i]:
             local_conv_act = None
@@ -488,10 +488,11 @@ def scaled_dot_product_attention(queries,
         trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
         return layers.reshape(
             x=trans_x,
-            shape=map(int, [
-                trans_x.shape[0], trans_x.shape[1],
-                trans_x.shape[2] * trans_x.shape[3]
-            ]))
+            shape=list(
+                map(int, [
+                    trans_x.shape[0], trans_x.shape[1], trans_x.shape[2] *
+                    trans_x.shape[3]
+                ])))
 
     q, k, v = __compute_qkv(queries, keys, values, num_heads)
 
diff --git a/python/paddle/fluid/op.py b/python/paddle/fluid/op.py
index 0b76e94157..93f021a360 100644
--- a/python/paddle/fluid/op.py
+++ b/python/paddle/fluid/op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import six
+
 import paddle.fluid.core as core
 import paddle.fluid.proto.framework_pb2 as framework_pb2
 
@@ -24,13 +26,13 @@ def get_all_op_protos():
     protostrs = core.get_all_op_protos()
     ret_values = []
     for pbstr in protostrs:
-        op_proto = framework_pb2.OpProto.FromString(str(pbstr))
+        op_proto = framework_pb2.OpProto.FromString(six.binary_type(pbstr))
         ret_values.append(op_proto)
     return ret_values
 
 
 def is_str(s):
-    return isinstance(s, str) or isinstance(s, unicode)
+    return isinstance(s, six.string_types)
 
 
 class OpDescCreationMethod(object):
@@ -189,7 +191,7 @@ class OperatorFactory(object):
         return self.get_op_info(t).method(**kwargs)
 
     def types(self):
-        return self.op_methods.keys()
+        return list(self.op_methods.keys())
 
     def get_op_info(self, t):
         if t not in self.op_methods:
@@ -197,13 +199,13 @@ class OperatorFactory(object):
         return self.op_methods.get(t)
 
     def get_op_input_names(self, type):
-        return map(lambda x: x[0], self.get_op_info(type).inputs)
+        return [x[0] for x in self.get_op_info(type).inputs]
 
     def get_op_inputs(self, type):
         return self.get_op_info(type).inputs
 
     def get_op_output_names(self, type):
-        return map(lambda x: x[0], self.get_op_info(type).outputs)
+        return [x[0] for x in self.get_op_info(type).outputs]
 
     def get_op_outputs(self, type):
         return self.get_op_info(type).outputs
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 3fe99f5501..a07325f46a 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -14,15 +14,15 @@
 import re
 from collections import defaultdict
 from paddle.fluid.framework import Program, Variable
-import framework
-import layers
-from backward import append_backward
-from framework import program_guard
-import unique_name
-from initializer import Constant
-from layer_helper import LayerHelper
-from regularizer import append_regularization_ops
-from clip import append_gradient_clip_ops, error_clip_callback
+from . import framework
+from . import layers
+from .backward import append_backward
+from .framework import program_guard
+from . import unique_name
+from .initializer import Constant
+from .layer_helper import LayerHelper
+from .regularizer import append_regularization_ops
+from .clip import append_gradient_clip_ops, error_clip_callback
 from contextlib import contextmanager
 
 __all__ = [
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 10028a8c6e..eabe6bb901 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -12,10 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import core
+from __future__ import print_function
 import multiprocessing
-import framework
-import executor
+from . import core
+from . import framework
+from . import executor
 import warnings
 import sys
 import os
@@ -94,7 +95,7 @@ class ParallelExecutor(object):
         self._places = []
         self._act_places = []
         if use_cuda:
-            for i in xrange(core.get_cuda_device_count()):
+            for i in range(core.get_cuda_device_count()):
                 p = core.Place()
                 self._act_places.append(core.CUDAPlace(i))
                 p.set_place(self._act_places[-1])
@@ -102,7 +103,7 @@ class ParallelExecutor(object):
         else:
             cpu_num = int(
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            for i in xrange(cpu_num):
+            for i in range(cpu_num):
                 p = core.Place()
                 self._act_places.append(core.CPUPlace())
                 p.set_place(self._act_places[-1])
@@ -121,7 +122,7 @@ class ParallelExecutor(object):
             else:
                 cpu_num = int(
                     os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                exec_strategy.num_threads = cpu_num
+                exec_strategy.num_threads = cpu_num * 2
 
         if build_strategy is None:
             build_strategy = BuildStrategy()
@@ -143,16 +144,16 @@ class ParallelExecutor(object):
         ) if share_vars_from else []
 
         self.persistable_vars = [
-            v.name
-            for v in filter(
-                lambda var: var.persistable and var.type != core.VarDesc.VarType.RAW,
-                main.list_vars())
+            v.name for v in [
+                var for var in main.list_vars()
+                if var.persistable and var.type != core.VarDesc.VarType.RAW
+            ]
         ]
 
         self.executor = core.ParallelExecutor(
             self._places,
             set([
-                p.name for p in main.global_block()._iter_parameters()
+                p.name for p in main.global_block().iter_parameters()
                 if not p.stop_gradient
             ]),
             set(self.persistable_vars), main.desc, loss_name
@@ -227,7 +228,9 @@ class ParallelExecutor(object):
         """
         if feed is None and feed_dict is not None:
             feed = feed_dict
-            print >> sys.stderr, "`feed_dict` is deprecated. Please use `feed=`"
+            print(
+                "`feed_dict` is deprecated. Please use `feed=`",
+                file=sys.stderr)
 
         if isinstance(feed, dict):
             feed_tensor_dict = dict()
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index 4a61f85ec4..afae577656 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from initializer import Initializer, Xavier, Constant
-from regularizer import WeightDecayRegularizer
+import six
+
+from .initializer import Initializer, Xavier, Constant
+from .regularizer import WeightDecayRegularizer
 
 __all__ = [
     'ParamAttr',
@@ -134,7 +136,7 @@ class ParamAttr(object):
             return [ParamAttr._to_attr(a) for a in arg]
         elif isinstance(arg, ParamAttr):
             return arg
-        elif isinstance(arg, str) or isinstance(arg, unicode):
+        elif isinstance(arg, six.string_types):
             return ParamAttr(name=arg)
         elif isinstance(arg, Initializer):
             return ParamAttr(initializer=arg)
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 6a321ae024..01983a8303 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import core
+from . import core
 from contextlib import contextmanager
 import os
 
@@ -218,20 +218,20 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
 def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     """The profiler interface.
     Different from cuda_profiler, this profiler can be used to profile both CPU
-    and GPU program. By defalut, it records the CPU and GPU operator kernels,
+    and GPU program. By default, it records the CPU and GPU operator kernels,
     if you want to profile other program, you can refer the profiling tutorial
     to add more records in C++ code.
 
     If the state == 'All', a profile proto file will be written to
     `profile_path`. This file records timeline information during the execution.
-    Then users can visualize this file to see the timeline, please refer 
+    Then users can visualize this file to see the timeline, please refer
     https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md
 
     Args:
         state (string) : The profiling state, which should be 'CPU' or 'GPU',
             telling the profiler to use CPU timer or GPU timer for profiling.
             Although users may have already specified the execution place
-            (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
+            (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler
             would not inherit this place.
         sorted_key (string) : If None, the profiling results will be printed
             in the order of first end time of events. Otherwise, the profiling
diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py
index bd57772713..93b38ad3fa 100644
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 import os
-import core
 import contextlib
+from . import core
 __all__ = [
     'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
 ]
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index 080c185420..6eaac4432d 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import framework
+from . import framework
 from . import core
 
 __all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer']
@@ -142,14 +142,20 @@ class L2DecayRegularizer(WeightDecayRegularizer):
             dtype="float32", shape=param.shape, lod_level=param.lod_level)
 
         if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
+            idx = block.create_var(
+                dtype="int64",
+                shape=param.shape,
+                type=core.VarDesc.VarType.LOD_TENSOR)
             decay = block.create_var(
                 dtype="float32",
                 shape=param.shape,
                 type=core.VarDesc.VarType.SELECTED_ROWS)
+            block.append_op(
+                type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
             block.append_op(
                 type='lookup_table',
                 inputs={'W': param,
-                        'Ids': grad},
+                        'Ids': idx},
                 outputs={'Out': decay},
                 attrs={'is_sparse': True})
             param = decay
@@ -216,14 +222,20 @@ class L1DecayRegularizer(WeightDecayRegularizer):
             dtype="float32", shape=param.shape, lod_level=param.lod_level)
 
         if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
+            idx = block.create_var(
+                dtype="int64",
+                shape=param.shape,
+                type=core.VarDesc.VarType.LOD_TENSOR)
             decay = block.create_var(
                 dtype="float32",
                 shape=param.shape,
                 type=core.VarDesc.VarType.SELECTED_ROWS)
+            block.append_op(
+                type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
             block.append_op(
                 type='lookup_table',
                 inputs={'W': param,
-                        'Ids': grad},
+                        'Ids': idx},
                 outputs={'Out': decay},
                 attrs={'is_sparse': True})
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
index ad28c9eff5..36a1a223cf 100644
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -63,7 +63,7 @@ def train(use_cuda, train_program, params_dirname):
             if event.step == 10:
                 test_metrics = trainer.test(
                     reader=test_reader, feed_order=['x', 'y'])
-                print test_metrics
+                print(test_metrics)
                 '''
                 ...
                 ['25.768919467926025']
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
index 7fed6d914f..9e4c384d92 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
@@ -28,11 +28,12 @@ images per class.
 
 """
 
-import cPickle
 import itertools
 import numpy
 import paddle.v2.dataset.common
 import tarfile
+from six.moves import cPickle as pickle
+from six.moves import zip
 
 __all__ = ['train10']
 
@@ -46,7 +47,7 @@ def reader_creator(filename, sub_name, batch_size=None):
         data = batch['data']
         labels = batch.get('labels', batch.get('fine_labels', None))
         assert labels is not None
-        for sample, label in itertools.izip(data, labels):
+        for sample, label in zip(data, labels):
             yield (sample / 255.0).astype(numpy.float32), int(label)
 
     def reader():
@@ -56,7 +57,7 @@ def reader_creator(filename, sub_name, batch_size=None):
 
             batch_count = 0
             for name in names:
-                batch = cPickle.load(f.extractfile(name))
+                batch = pickle.load(f.extractfile(name))
                 for item in read_batch(batch):
                     if isinstance(batch_size, int) and batch_count > batch_size:
                         break
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
index 8e222d2690..a1f62db093 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
-
 import paddle
 import paddle.fluid as fluid
 import numpy
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
index dbc7bc06c9..8429551765 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
-
 import paddle
 import paddle.fluid as fluid
 import numpy
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
index 67aa21e8c5..e3602e2d56 100755
--- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
-
 import paddle
 import paddle.fluid as fluid
 import numpy as np
@@ -178,14 +176,15 @@ def train(use_cuda, train_program, params_dirname):
             if float(avg_cost) < 100.0:  # Large value to increase CI speed
                 trainer.save_params(params_dirname)
             else:
-                print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
-                                                              float(avg_cost)))
+                print(
+                    ('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+                                                             float(avg_cost))))
                 if math.isnan(float(avg_cost)):
                     sys.exit("got NaN loss, training failed.")
 
         elif isinstance(event, fluid.EndStepEvent):
             print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, map(np.array, event.metrics)))
+                event.step, event.epoch, list(map(np.array, event.metrics))))
             if event.step == 1:  # Run 2 iterations to speed CI
                 trainer.save_params(params_dirname)
                 trainer.stop()
@@ -207,14 +206,14 @@ def infer(use_cuda, inference_program, params_dirname):
         inference_program, param_path=params_dirname, place=place)
 
     # Setup input by creating LoDTensor to represent sequence of words.
-    # Here each word is the basic element of the LoDTensor and the shape of 
-    # each word (base_shape) should be [1] since it is simply an index to 
+    # Here each word is the basic element of the LoDTensor and the shape of
+    # each word (base_shape) should be [1] since it is simply an index to
     # look up for the corresponding word vector.
     # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-    # which has only one level of detail. Then the created LoDTensor will have only 
-    # one higher level structure (sequence of words, or sentence) than the basic 
-    # element (word). Hence the LoDTensor will hold data for three sentences of 
-    # length 3, 4 and 2, respectively. 
+    # which has only one level of detail. Then the created LoDTensor will have only
+    # one higher level structure (sequence of words, or sentence) than the basic
+    # element (word). Hence the LoDTensor will hold data for three sentences of
+    # length 3, 4 and 2, respectively.
     # Note that recursive_sequence_lengths should be a list of lists.
     recursive_seq_lens = [[3, 4, 2]]
     base_shape = [1]
diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
index 8becd2404b..6fb0c85a8b 100644
--- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
@@ -250,7 +250,7 @@ def decode_main(use_cuda, is_sparse):
     feeder = fluid.DataFeeder(feed_list, place)
 
     for data in train_data():
-        feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+        feed_dict = feeder.feed([[x[0]] for x in data])
         feed_dict['init_ids'] = init_ids
         feed_dict['init_scores'] = init_scores
 
@@ -259,7 +259,7 @@ def decode_main(use_cuda, is_sparse):
             feed=feed_dict,
             fetch_list=[translation_ids, translation_scores],
             return_numpy=False)
-        print result_ids.recursive_sequence_lengths()
+        print(result_ids.recursive_sequence_lengths())
         break
 
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
index fd278f45f1..898807db6f 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
+
 import argparse
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -89,8 +89,10 @@ def train(use_cuda, train_program, params_dirname):
                 if math.isnan(avg_cost):
                     sys.exit("got NaN loss, training failed.")
         elif isinstance(event, fluid.EndStepEvent):
-            print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, map(numpy.array, event.metrics)))
+            print(
+                ("Step {0}, Epoch {1} Metrics {2}".format(
+                    event.step, event.epoch,
+                    list(map(numpy.array, event.metrics)))))
 
     train_reader = paddle.batch(
         paddle.reader.shuffle(
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
index b2b544e791..6dd64be315 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
+
 import argparse
 import paddle.fluid as fluid
 import paddle
diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
index c860f16417..60f3d8e105 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
@@ -186,8 +186,9 @@ def train(use_cuda, train_program, params_dirname):
                 trainer.save_params(params_dirname)
                 trainer.stop()
             else:
-                print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
-                                                              float(avg_cost)))
+                print(
+                    ('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+                                                             float(avg_cost))))
                 if math.isnan(float(avg_cost)):
                     sys.exit("got NaN loss, training failed.")
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
index 1668ae83d3..24e65d1bd5 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
-
 import paddle
 import paddle.fluid as fluid
 from functools import partial
@@ -98,7 +96,7 @@ def train(use_cuda, train_program, params_dirname):
                     sys.exit("got NaN loss, training failed.")
         elif isinstance(event, fluid.EndStepEvent):
             print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, map(np.array, event.metrics)))
+                event.step, event.epoch, list(map(np.array, event.metrics))))
             if event.step == 1:  # Run 2 iterations to speed CI
                 trainer.save_params(params_dirname)
                 trainer.stop()
@@ -125,14 +123,14 @@ def infer(use_cuda, inference_program, params_dirname=None):
         place=place)
 
     # Setup input by creating LoDTensor to represent sequence of words.
-    # Here each word is the basic element of the LoDTensor and the shape of 
-    # each word (base_shape) should be [1] since it is simply an index to 
+    # Here each word is the basic element of the LoDTensor and the shape of
+    # each word (base_shape) should be [1] since it is simply an index to
     # look up for the corresponding word vector.
     # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-    # which has only one level of detail. Then the created LoDTensor will have only 
-    # one higher level structure (sequence of words, or sentence) than the basic 
-    # element (word). Hence the LoDTensor will hold data for three sentences of 
-    # length 3, 4 and 2, respectively. 
+    # which has only one level of detail. Then the created LoDTensor will have only
+    # one higher level structure (sequence of words, or sentence) than the basic
+    # element (word). Hence the LoDTensor will hold data for three sentences of
+    # length 3, 4 and 2, respectively.
     # Note that recursive_sequence_lengths should be a list of lists.
     recursive_seq_lens = [[3, 4, 2]]
     base_shape = [1]
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
index 8da89d82cb..b3b1505a0f 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
-
 import paddle
 import paddle.fluid as fluid
 from functools import partial
@@ -113,7 +111,7 @@ def train(use_cuda, train_program, params_dirname):
                     sys.exit("got NaN loss, training failed.")
         elif isinstance(event, fluid.EndStepEvent):
             print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, map(np.array, event.metrics)))
+                event.step, event.epoch, list(map(np.array, event.metrics))))
             if event.step == 1:  # Run 2 iterations to speed CI
                 trainer.save_params(params_dirname)
                 trainer.stop()
@@ -140,14 +138,14 @@ def infer(use_cuda, inference_program, params_dirname=None):
         place=place)
 
     # Setup input by creating LoDTensor to represent sequence of words.
-    # Here each word is the basic element of the LoDTensor and the shape of 
-    # each word (base_shape) should be [1] since it is simply an index to 
+    # Here each word is the basic element of the LoDTensor and the shape of
+    # each word (base_shape) should be [1] since it is simply an index to
     # look up for the corresponding word vector.
     # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-    # which has only one level of detail. Then the created LoDTensor will have only 
-    # one higher level structure (sequence of words, or sentence) than the basic 
-    # element (word). Hence the LoDTensor will hold data for three sentences of 
-    # length 3, 4 and 2, respectively. 
+    # which has only one level of detail. Then the created LoDTensor will have only
+    # one higher level structure (sequence of words, or sentence) than the basic
+    # element (word). Hence the LoDTensor will hold data for three sentences of
+    # length 3, 4 and 2, respectively.
     # Note that recursive_sequence_lengths should be a list of lists.
     recursive_seq_lens = [[3, 4, 2]]
     base_shape = [1]
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
index 74faa2e8aa..25f99ff0fd 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
-
 import paddle
 import paddle.fluid as fluid
 from functools import partial
@@ -107,7 +105,7 @@ def train(use_cuda, train_program, params_dirname):
                     sys.exit("got NaN loss, training failed.")
         elif isinstance(event, fluid.EndStepEvent):
             print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, map(np.array, event.metrics)))
+                event.step, event.epoch, list(map(np.array, event.metrics))))
             if event.step == 1:  # Run 2 iterations to speed CI
                 trainer.save_params(params_dirname)
                 trainer.stop()
@@ -135,14 +133,14 @@ def infer(use_cuda, inference_program, params_dirname=None):
         place=place)
 
     # Setup input by creating LoDTensor to represent sequence of words.
-    # Here each word is the basic element of the LoDTensor and the shape of 
-    # each word (base_shape) should be [1] since it is simply an index to 
+    # Here each word is the basic element of the LoDTensor and the shape of
+    # each word (base_shape) should be [1] since it is simply an index to
     # look up for the corresponding word vector.
     # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-    # which has only one level of detail. Then the created LoDTensor will have only 
-    # one higher level structure (sequence of words, or sentence) than the basic 
-    # element (word). Hence the LoDTensor will hold data for three sentences of 
-    # length 3, 4 and 2, respectively. 
+    # which has only one level of detail. Then the created LoDTensor will have only
+    # one higher level structure (sequence of words, or sentence) than the basic
+    # element (word). Hence the LoDTensor will hold data for three sentences of
+    # length 3, 4 and 2, respectively.
     # Note that recursive_sequence_lengths should be a list of lists.
     recursive_seq_lens = [[3, 4, 2]]
     base_shape = [1]
diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
index 95002aa7f9..ce6342c2da 100644
--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
+
 from paddle.fluid.layers.device import get_places
 import unittest
 import paddle.fluid as fluid
@@ -175,7 +175,7 @@ def train(word_dict,
     def train_loop(main_program):
         exe.run(fluid.default_startup_program())
 
-        for pass_id in xrange(PASS_NUM):
+        for pass_id in range(PASS_NUM):
             for data in train_data():
                 cost_val, acc_val = exe.run(main_program,
                                             feed=feeder.feed(data),
@@ -235,14 +235,14 @@ def infer(word_dict, use_cuda, save_dirname=None):
         word_dict_len = len(word_dict)
 
         # Setup input by creating LoDTensor to represent sequence of words.
-        # Here each word is the basic element of the LoDTensor and the shape of 
-        # each word (base_shape) should be [1] since it is simply an index to 
+        # Here each word is the basic element of the LoDTensor and the shape of
+        # each word (base_shape) should be [1] since it is simply an index to
         # look up for the corresponding word vector.
         # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-        # which has only one level of detail. Then the created LoDTensor will have only 
-        # one higher level structure (sequence of words, or sentence) than the basic 
-        # element (word). Hence the LoDTensor will hold data for three sentences of 
-        # length 3, 4 and 2, respectively. 
+        # which has only one level of detail. Then the created LoDTensor will have only
+        # one higher level structure (sequence of words, or sentence) than the basic
+        # element (word). Hence the LoDTensor will hold data for three sentences of
+        # length 3, 4 and 2, respectively.
         # Note that recursive_sequence_lengths should be a list of lists.
         recursive_seq_lens = [[3, 4, 2]]
         base_shape = [1]
diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py
index 71bf5f8b3a..37b64fa94a 100644
--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
@@ -114,7 +114,7 @@ def infer(use_cuda, save_dirname=None):
         test_reader = paddle.batch(
             paddle.dataset.uci_housing.test(), batch_size=batch_size)
 
-        test_data = test_reader().next()
+        test_data = next(test_reader())
         test_feat = numpy.array(
             [data[0] for data in test_data]).astype("float32")
         test_label = numpy.array(
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index a2fb186b86..de6fe5f140 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
-
 import paddle
 import paddle.fluid as fluid
 import contextlib
@@ -121,7 +119,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
     avg_cost = fluid.layers.mean(cost)
     acc = fluid.layers.accuracy(input=predict, label=label)
 
-    # Test program 
+    # Test program
     test_program = fluid.default_main_program().clone(for_test=True)
 
     optimizer = fluid.optimizer.Adam(learning_rate=0.001)
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index d489feae9c..b7ac911caf 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -181,7 +181,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
 
         start_time = time.time()
         batch_id = 0
-        for pass_id in xrange(PASS_NUM):
+        for pass_id in range(PASS_NUM):
             for data in train_data():
                 cost = exe.run(main_program,
                                feed=feeder.feed(data),
@@ -248,14 +248,14 @@ def infer(use_cuda, save_dirname=None):
          fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
 
         # Setup input by creating LoDTensor to represent sequence of words.
-        # Here each word is the basic element of the LoDTensor and the shape of 
-        # each word (base_shape) should be [1] since it is simply an index to 
+        # Here each word is the basic element of the LoDTensor and the shape of
+        # each word (base_shape) should be [1] since it is simply an index to
         # look up for the corresponding word vector.
         # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-        # which has only one level of detail. Then the created LoDTensor will have only 
-        # one higher level structure (sequence of words, or sentence) than the basic 
-        # element (word). Hence the LoDTensor will hold data for three sentences of 
-        # length 3, 4 and 2, respectively. 
+        # which has only one level of detail. Then the created LoDTensor will have only
+        # one higher level structure (sequence of words, or sentence) than the basic
+        # element (word). Hence the LoDTensor will hold data for three sentences of
+        # length 3, 4 and 2, respectively.
         # Note that recursive_sequence_lengths should be a list of lists.
         recursive_seq_lens = [[3, 4, 2]]
         base_shape = [1]
diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py
index 90c301a661..462faad3e1 100644
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
@@ -199,7 +199,7 @@ def train_main(use_cuda, is_sparse, is_local=True):
         feeder = fluid.DataFeeder(feed_list, place)
 
         batch_id = 0
-        for pass_id in xrange(1):
+        for pass_id in range(1):
             for data in train_data():
                 outs = exe.run(main_program,
                                feed=feeder.feed(data),
@@ -273,7 +273,7 @@ def decode_main(use_cuda, is_sparse):
     feeder = fluid.DataFeeder(feed_list, place)
 
     for data in train_data():
-        feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+        feed_dict = feeder.feed([[x[0]] for x in data])
         feed_dict['init_ids'] = init_ids
         feed_dict['init_scores'] = init_scores
 
@@ -282,7 +282,7 @@ def decode_main(use_cuda, is_sparse):
             feed=feed_dict,
             fetch_list=[translation_ids, translation_scores],
             return_numpy=False)
-        print result_ids.recursive_sequence_lengths()
+        print(result_ids.recursive_sequence_lengths())
         break
 
 
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index c471863920..3e5f76d12d 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 
 import paddle.fluid.core as core
 import math
diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py
index 6548766ef5..b30c8771fc 100644
--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -260,15 +260,15 @@ def infer(use_cuda, save_dirname=None):
 
         # Use the first data from paddle.dataset.movielens.test() as input
         assert feed_target_names[0] == "user_id"
-        # Use create_lod_tensor(data, recursive_sequence_lengths, place) API 
-        # to generate LoD Tensor where `data` is a list of sequences of index 
-        # numbers, `recursive_sequence_lengths` is the length-based level of detail 
+        # Use create_lod_tensor(data, recursive_sequence_lengths, place) API
+        # to generate LoD Tensor where `data` is a list of sequences of index
+        # numbers, `recursive_sequence_lengths` is the length-based level of detail
         # (lod) info associated with `data`.
         # For example, data = [[10, 2, 3], [2, 3]] means that it contains
         # two sequences of indexes, of length 3 and 2, respectively.
-        # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one 
-        # level of detail info, indicating that `data` consists of two sequences 
-        # of length 3 and 2, respectively. 
+        # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one
+        # level of detail info, indicating that `data` consists of two sequences
+        # of length 3 and 2, respectively.
         user_id = fluid.create_lod_tensor([[1]], [[1]], place)
 
         assert feed_target_names[1] == "gender_id"
diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
index 4672826241..2e79be2bd0 100644
--- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
+++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -175,7 +175,7 @@ def train(use_cuda, save_dirname=None):
     feeder = fluid.DataFeeder(feed_list, place)
 
     batch_id = 0
-    for pass_id in xrange(2):
+    for pass_id in range(2):
         for data in train_data():
             outs = exe.run(framework.default_main_program(),
                            feed=feeder.feed(data),
@@ -213,14 +213,14 @@ def infer(use_cuda, save_dirname=None):
          fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
 
         # Setup input by creating LoDTensor to represent sequence of words.
-        # Here each word is the basic element of the LoDTensor and the shape of 
-        # each word (base_shape) should be [1] since it is simply an index to 
+        # Here each word is the basic element of the LoDTensor and the shape of
+        # each word (base_shape) should be [1] since it is simply an index to
         # look up for the corresponding word vector.
         # Suppose the recursive_sequence_lengths info is set to [[4, 6]],
-        # which has only one level of detail. Then the created LoDTensor will have only 
-        # one higher level structure (sequence of words, or sentence) than the basic 
-        # element (word). Hence the LoDTensor will hold data for two sentences of 
-        # length 4 and 6, respectively. 
+        # which has only one level of detail. Then the created LoDTensor will have only
+        # one higher level structure (sequence of words, or sentence) than the basic
+        # element (word). Hence the LoDTensor will hold data for two sentences of
+        # length 4 and 6, respectively.
         # Note that recursive_sequence_lengths should be a list of lists.
         recursive_seq_lens = [[4, 6]]
         base_shape = [1]
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index 3b957508ca..e761e05795 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -85,9 +85,11 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
         pd = fluid.layers.ParallelDo(places)
         with pd.do():
             avg_cost, predict_word = __network__(
-                map(pd.read_input, [
-                    first_word, second_word, third_word, forth_word, next_word
-                ]))
+                list(
+                    map(pd.read_input, [
+                        first_word, second_word, third_word, forth_word,
+                        next_word
+                    ])))
             pd.write_output(avg_cost)
 
         avg_cost = fluid.layers.mean(pd())
@@ -167,11 +169,11 @@ def infer(use_cuda, save_dirname=None):
         word_dict = paddle.dataset.imikolov.build_dict()
         dict_size = len(word_dict)
 
-        # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word 
-        # is simply an index to look up for the corresponding word vector and hence 
-        # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, 
-        # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] 
-        # meaning there is only one level of detail and there is only one sequence of 
+        # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
+        # is simply an index to look up for the corresponding word vector and hence
+        # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths,
+        # which is length-based level of detail (lod) of each LoDTensor, should be [[1]]
+        # meaning there is only one level of detail and there is only one sequence of
         # one word on this level.
         # Note that recursive_sequence_lengths should be a list of lists.
         recursive_seq_lens = [[1]]
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index bec9f8594f..ccc62b442f 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -78,7 +78,7 @@ for pass_id in range(PASS_NUM):
 
         if avg_loss_value[0] < 10.0:
             exit(0)  # if avg cost less than 10.0, we think our code is good.
-        print avg_loss_value[0]
+        print(avg_loss_value[0])
         if math.isnan(float(avg_loss_value)):
             sys.exit("got NaN loss, training failed.")
 exit(1)
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
index dfebb9a06e..b2a59d27da 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
-
 import sys
 
 import paddle
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
index fa696acdfa..323ddfb691 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -118,7 +118,7 @@ def main():
     feeder = fluid.DataFeeder(feed_list, place)
 
     batch_id = 0
-    for pass_id in xrange(10):
+    for pass_id in range(10):
         for data in train_data():
             outs = exe.run(fluid.default_main_program(),
                            feed=feeder.feed(data),
diff --git a/python/paddle/fluid/tests/demo/fc_gan.py b/python/paddle/fluid/tests/demo/fc_gan.py
index 8ea1b2b15c..3d92f50f0a 100644
--- a/python/paddle/fluid/tests/demo/fc_gan.py
+++ b/python/paddle/fluid/tests/demo/fc_gan.py
@@ -137,7 +137,7 @@ def main():
             generated_img = exe.run(g_program,
                                     feed={'noise': n},
                                     fetch_list={g_img})[0]
-            real_data = numpy.array(map(lambda x: x[0], data)).astype('float32')
+            real_data = numpy.array([x[0] for x in data]).astype('float32')
             real_data = real_data.reshape(num_true, 784)
             total_data = numpy.concatenate([real_data, generated_img])
             total_label = numpy.concatenate([
@@ -150,7 +150,7 @@ def main():
                                 feed={'img': total_data,
                                       'label': total_label},
                                 fetch_list={d_loss})[0]
-            for _ in xrange(NUM_TRAIN_TIMES_OF_DG):
+            for _ in range(NUM_TRAIN_TIMES_OF_DG):
                 n = numpy.random.uniform(
                     low=-1.0, high=1.0,
                     size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape(
diff --git a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
index b839e14889..a00325d79b 100644
--- a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
@@ -36,7 +36,7 @@ if len(sys.argv) == 1:
 else:
     word_dict = load_vocab(sys.argv[1])
     word_dict["<unk>"] = len(word_dict)
-print "Dict dim = ", len(word_dict)
+print("Dict dim = ", len(word_dict))
 
 # input text data
 data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1)
diff --git a/python/paddle/fluid/tests/no_test_concurrency.py b/python/paddle/fluid/tests/no_test_concurrency.py
index e8f6cfb4a9..3bc0c9808e 100644
--- a/python/paddle/fluid/tests/no_test_concurrency.py
+++ b/python/paddle/fluid/tests/no_test_concurrency.py
@@ -194,7 +194,7 @@ class TestRoutineOp(unittest.TestCase):
             quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
 
             with fluid.Go():
-                for i in xrange(10):
+                for i in range(10):
                     fluid.channel_recv(ch1, result)
                     Print(result)
 
diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py
index 7a2502fa2f..8bf750940d 100644
--- a/python/paddle/fluid/tests/test_beam_search_decoder.py
+++ b/python/paddle/fluid/tests/test_beam_search_decoder.py
@@ -155,7 +155,7 @@ def train_main(use_cuda):
         ]
         feeder = fluid.DataFeeder(feed_list, place)
 
-        for pass_id in xrange(1):
+        for pass_id in range(1):
             for batch_id, data in enumerate(train_reader()):
                 outs = exe.run(main_program,
                                feed=feeder.feed(data),
@@ -204,8 +204,8 @@ def decode_main(use_cuda):
     ]
     feeder = fluid.DataFeeder(feed_list, place)
 
-    data = train_reader().next()
-    feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+    data = next(train_reader())
+    feed_dict = feeder.feed([[x[0]] for x in data])
     feed_dict['init_ids'] = init_ids
     feed_dict['init_scores'] = init_scores
 
@@ -214,7 +214,7 @@ def decode_main(use_cuda):
         feed=feed_dict,
         fetch_list=[translation_ids, translation_scores],
         return_numpy=False)
-    print result_ids.lod()
+    print(result_ids.lod())
 
 
 class TestBeamSearchDecoder(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 2d70c986b1..fd45abd0a7 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid.framework import Program, program_guard
diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py
index 3dc858971c..e8edd7fbbb 100644
--- a/python/paddle/fluid/tests/test_error_clip.py
+++ b/python/paddle/fluid/tests/test_error_clip.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
 import numpy as np
 import paddle
 import paddle.fluid as fluid
diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py
index 799c31dfe5..082f64c146 100644
--- a/python/paddle/fluid/tests/test_if_else_op.py
+++ b/python/paddle/fluid/tests/test_if_else_op.py
@@ -76,15 +76,15 @@ class TestMNISTIfElseOp(unittest.TestCase):
         PASS_NUM = 100
         for pass_id in range(PASS_NUM):
             for data in train_reader():
-                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
-                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                x_data = np.array([x[0] for x in data]).astype("float32")
+                y_data = np.array([x[1] for x in data]).astype("int64")
                 y_data = np.expand_dims(y_data, axis=1)
 
                 outs = exe.run(prog,
                                feed={'x': x_data,
                                      'y': y_data},
                                fetch_list=[avg_loss])
-                print outs[0]
+                print(outs[0])
                 if outs[0] < 1.0:
                     return
         self.assertFalse(True)
@@ -131,15 +131,15 @@ class TestMNISTIfElseOp(unittest.TestCase):
         PASS_NUM = 100
         for pass_id in range(PASS_NUM):
             for data in train_reader():
-                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
-                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                x_data = np.array([x[0] for x in data]).astype("float32")
+                y_data = np.array([x[1] for x in data]).astype("int64")
                 y_data = y_data.reshape((y_data.shape[0], 1))
 
                 outs = exe.run(prog,
                                feed={'x': x_data,
                                      'y': y_data},
                                fetch_list=[avg_loss])
-                print outs[0]
+                print(outs[0])
                 if outs[0] < 1.0:
                     return
         self.assertFalse(True)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 322d76515e..a6a911721d 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -40,7 +40,7 @@ function(py_test_modules TARGET_NAME)
              ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     if (py_test_modules_SERIAL)
-        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
     endif()
   endif()
 endfunction()
@@ -49,6 +49,9 @@ list(REMOVE_ITEM TEST_OPS test_dist_train)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
 list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
+list(REMOVE_ITEM TEST_OPS test_dist_transformer)
+list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
+list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -61,4 +64,7 @@ if(WITH_DISTRIBUTE)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
+py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
 py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
+py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
+py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py
index e891ee932f..b98a92dcbe 100644
--- a/python/paddle/fluid/tests/unittests/benchmark.py
+++ b/python/paddle/fluid/tests/unittests/benchmark.py
@@ -16,6 +16,7 @@ import numpy as np
 import unittest
 import time
 import itertools
+import six
 
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -40,8 +41,8 @@ class BenchmarkSuite(OpTest):
             expect_t = np.array(item_cpu_out)
             actual = item_gpu_out
             actual_t = np.array(item_gpu_out)
-            var_name = variable if isinstance(variable,
-                                              basestring) else variable.name
+            var_name = variable if isinstance(
+                variable, six.string_types) else variable.name
             self.assertTrue(
                 np.allclose(
                     actual_t, expect_t, atol=atol),
@@ -53,7 +54,7 @@ class BenchmarkSuite(OpTest):
 
     def _get_input_names(self):
         inputs = []
-        for name, value in self.inputs.iteritems():
+        for name, value in list(self.inputs.items()):
             if isinstance(value, list):
                 inputs.extend([sub_name for sub_name, _ in value])
             inputs.append(name)
@@ -61,7 +62,7 @@ class BenchmarkSuite(OpTest):
 
     def _get_output_names(self):
         outputs = []
-        for var_name, var in self.outputs.iteritems():
+        for var_name, var in list(self.outputs.items()):
             if isinstance(var, list):
                 for sub_var_name, sub_var in var:
                     outputs.append(sub_var_name)
diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
index 72bc1729b0..62ca67ae4c 100644
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -14,6 +14,7 @@
 
 import numpy as np
 import argparse
+import six
 import time
 import math
 
@@ -174,6 +175,9 @@ class SE_ResNeXt():
             padding=(filter_size - 1) / 2,
             groups=groups,
             act=None,
+            # avoid pserver CPU init differs from GPU
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant()),
             bias_attr=False)
         return fluid.layers.batch_norm(input=conv, act=act)
 
@@ -194,10 +198,8 @@ class SE_ResNeXt():
 
 def get_model(batch_size):
     # Input data
-    image = fluid.layers.fill_constant(
-        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
-    label = fluid.layers.fill_constant(
-        shape=[batch_size, 1], dtype='int64', value=0.0)
+    image = fluid.layers.data(name="data", shape=[3, 224, 224], dtype='float32')
+    label = fluid.layers.data(name="int64", shape=[1], dtype='int64')
 
     # Train program
     model = SE_ResNeXt(layers=50)
@@ -222,8 +224,10 @@ def get_model(batch_size):
     lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
 
     optimizer = fluid.optimizer.Momentum(
-        learning_rate=fluid.layers.piecewise_decay(
-            boundaries=bd, values=lr),
+        # FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed.
+        #learning_rate=fluid.layers.piecewise_decay(
+        #    boundaries=bd, values=lr),
+        learning_rate=base_lr,
         momentum=0.9,
         regularization=fluid.regularizer.L2Decay(1e-4))
     optimizer.minimize(avg_cost)
@@ -232,7 +236,7 @@ def get_model(batch_size):
     train_reader = paddle.batch(
         paddle.dataset.flowers.train(), batch_size=batch_size)
     test_reader = paddle.batch(
-        paddle.dataset.flowers.test(), batch_size=batch_size)
+        paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
 
     return test_program, avg_cost, train_reader, test_reader, acc_top1, out
 
@@ -256,7 +260,6 @@ class DistSeResneXt2x2:
                            trainers)
         pserver_prog = t.get_pserver_program(current_endpoint)
         startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
-
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup_prog)
@@ -278,7 +281,7 @@ class DistSeResneXt2x2:
 
     def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
         test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
-            batch_size=20)
+            batch_size=2)
         if is_dist:
             t = get_transpiler(trainer_id,
                                fluid.default_main_program(), endpoints,
@@ -294,24 +297,27 @@ class DistSeResneXt2x2:
         strategy.num_threads = 1
         strategy.allow_op_delay = False
         exe = fluid.ParallelExecutor(
-            True,
-            loss_name=avg_cost.name,
-            exec_strategy=strategy,
-            num_trainers=trainers,
-            trainer_id=trainer_id)
+            True, loss_name=avg_cost.name, exec_strategy=strategy)
 
         feed_var_list = [
-            var for var in trainer_prog.global_block().vars.itervalues()
+            var for var in trainer_prog.global_block().vars.values()
             if var.is_data
         ]
 
         feeder = fluid.DataFeeder(feed_var_list, place)
-        reader_generator = train_reader()
-        first_loss, = exe.run(fetch_list=[avg_cost.name])
+        reader_generator = test_reader()
+
+        data = next(reader_generator)
+        first_loss, = exe.run(fetch_list=[avg_cost.name],
+                              feed=feeder.feed(data))
         print(first_loss)
-        for i in xrange(5):
-            loss, = exe.run(fetch_list=[avg_cost.name])
-        last_loss, = exe.run(fetch_list=[avg_cost.name])
+
+        for i in six.moves.xrange(5):
+            data = next(reader_generator)
+            loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
+
+        data = next(reader_generator)
+        last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
         print(last_loss)
 
 
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
new file mode 100644
index 0000000000..ee8020a735
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -0,0 +1,280 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+import os
+import sys
+import transformer_model
+import paddle.dataset.wmt16 as wmt16
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
+
+
+class ModelHyperParams(object):
+    # Dictionary size for source and target language. This model directly uses
+    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
+    # alreay been added, but the <pad> token is not added. Transformer requires
+    # sequences in a mini-batch are padded to have the same length. A <pad> token is
+    # added into the original dictionary in paddle.dateset.wmt16.
+
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # index for <pad> token in source language.
+    src_pad_idx = src_vocab_size
+
+    # size of target word dictionay
+    trg_vocab_size = 10000
+    # index for <pad> token in target language.
+    trg_pad_idx = trg_vocab_size
+
+    # position value corresponding to the <pad> token.
+    pos_pad_idx = 0
+
+    # max length of sequences. It should plus 1 to include position
+    # padding token for position encoding.
+    max_length = 50
+
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of head used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias. Then, convert the numpy
+    data to tensors and return a dict mapping names to tensors.
+    """
+
+    def __pad_batch_data(insts,
+                         pad_idx,
+                         is_target=False,
+                         return_pos=True,
+                         return_attn_bias=True,
+                         return_max_len=True):
+        """
+        Pad the instances to the max sequence length in batch, and generate the
+        corresponding position data and attention bias.
+        """
+        return_list = []
+        max_len = max(len(inst) for inst in insts)
+        inst_data = np.array(
+            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+        return_list += [inst_data.astype("int64").reshape([-1, 1])]
+        if return_pos:
+            inst_pos = np.array([[
+                pos_i + 1 if w_i != pad_idx else 0
+                for pos_i, w_i in enumerate(inst)
+            ] for inst in inst_data])
+
+            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+        if return_attn_bias:
+            if is_target:
+                # This is used to avoid attention on paddings and subsequent
+                # words.
+                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
+                                              max_len))
+                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
+                    [-1, 1, max_len, max_len])
+                slf_attn_bias_data = np.tile(slf_attn_bias_data,
+                                             [1, n_head, 1, 1]) * [-1e9]
+            else:
+                # This is used to avoid attention on paddings.
+                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
+                                               (max_len - len(inst))
+                                               for inst in insts])
+                slf_attn_bias_data = np.tile(
+                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
+                    [1, n_head, max_len, 1])
+            return_list += [slf_attn_bias_data.astype("float32")]
+        if return_max_len:
+            return_list += [max_len]
+        return return_list if len(return_list) > 1 else return_list[0]
+
+    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
+        [inst[0] for inst in insts], src_pad_idx, is_target=False)
+    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
+        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
+    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
+                                [1, 1, trg_max_len, 1]).astype("float32")
+    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
+                                False, False, False)
+    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
+
+    return [
+        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
+        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+    ]
+
+
+def transformer(use_feed):
+    assert not use_feed, "transfomer doesn't support feed yet"
+    return transformer_model.transformer(
+        ModelHyperParams.src_vocab_size + 1,
+        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
+        ModelHyperParams.n_layer, ModelHyperParams.n_head,
+        ModelHyperParams.d_key, ModelHyperParams.d_value,
+        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
+        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
+        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
+
+
+def get_model():
+    avg_cost = transformer(use_feed=False)
+    optimizer = fluid.optimizer.Adam()
+    optimizer.minimize(avg_cost)
+    return avg_cost
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    return t
+
+
+class DistTransformer2x2(object):
+    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
+                    trainer_id):
+        get_model()
+        t = get_transpiler(trainer_id,
+                           fluid.default_main_program(), pserver_endpoints,
+                           trainers)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(pserver_prog)
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 20
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            print("waiting ps ready: ", pid)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+        avg_cost = get_model()
+        if is_dist:
+            t = get_transpiler(trainer_id,
+                               fluid.default_main_program(), endpoints,
+                               trainers)
+            trainer_prog = t.get_trainer_program()
+        else:
+            trainer_prog = fluid.default_main_program()
+
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+        exe = fluid.ParallelExecutor(
+            True, loss_name=avg_cost.name, exec_strategy=strategy)
+
+        first_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(first_loss)
+        for i in xrange(5):
+            _ = exe.run(fetch_list=[avg_cost.name])
+        last_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(last_loss)
+
+
+def main(role="pserver",
+         endpoints="127.0.0.1:9123",
+         trainer_id=0,
+         current_endpoint="127.0.0.1:9123",
+         trainers=1,
+         is_dist=True):
+
+    reader = paddle.batch(
+        wmt16.train(ModelHyperParams.src_vocab_size,
+                    ModelHyperParams.trg_vocab_size),
+        batch_size=transformer_model.batch_size)
+
+    with fluid.recordio_writer.create_recordio_writer(
+            WMT16_RECORDIO_FILE) as writer:
+        for batch in reader():
+            for tensor in prepare_batch_input(
+                    batch, ModelHyperParams.src_pad_idx,
+                    ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
+                t = fluid.LoDTensor()
+                t.set(tensor, fluid.CPUPlace())
+                writer.append_tensor(t)
+            writer.complete_append_tensor()
+
+    model = DistTransformer2x2()
+    if role == "pserver":
+        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+    else:
+        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 7:
+        print(
+            "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+        )
+    role = sys.argv[1]
+    endpoints = sys.argv[2]
+    trainer_id = int(sys.argv[3])
+    current_endpoint = sys.argv[4]
+    trainers = int(sys.argv[5])
+    is_dist = True if sys.argv[6] == "TRUE" else False
+    main(
+        role=role,
+        endpoints=endpoints,
+        trainer_id=trainer_id,
+        current_endpoint=current_endpoint,
+        trainers=trainers,
+        is_dist=is_dist)
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 82b5e7cf0b..b27d773f09 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -26,13 +26,15 @@ from paddle.fluid.op import Operator
 from paddle.fluid.executor import Executor
 from paddle.fluid.framework import Program, OpProtoHolder, Variable
 from testsuite import create_op, set_input, append_input_output, append_loss_ops
+from functools import reduce
+from six.moves import zip
 
 
 def randomize_probability(batch_size, class_num, dtype='float32'):
     prob = np.random.uniform(
         0.1, 1.0, size=(batch_size, class_num)).astype(dtype)
     prob_sum = prob.sum(axis=1)
-    for i in xrange(len(prob)):
+    for i in range(len(prob)):
         prob[i] /= prob_sum[i]
     return prob
 
@@ -66,6 +68,10 @@ def get_numeric_gradient(place,
         tensor_to_check_dtype = np.float32
     elif tensor_to_check_dtype == core.VarDesc.VarType.FP64:
         tensor_to_check_dtype = np.float64
+    elif tensor_to_check_dtype == core.VarDesc.VarType.FP16:
+        tensor_to_check_dtype = np.float16
+        # set delta as np.float16, will automatic convert to float32, float64
+        delta = np.array(delta).astype(np.float16)
     else:
         raise ValueError("Not supported data type " + str(
             tensor_to_check_dtype))
@@ -73,20 +79,31 @@ def get_numeric_gradient(place,
     gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)
 
     def __get_elem__(tensor, i):
-        if tensor_to_check_dtype == np.float32:
+        if tensor_to_check_dtype == np.float16:
+            numpy_tensor = np.array(tensor).astype(np.float16)
+            numpy_tensor = numpy_tensor.flatten()
+            return numpy_tensor[i]
+        elif tensor_to_check_dtype == np.float32:
             return tensor._get_float_element(i)
         else:
             return tensor._get_double_element(i)
 
     def __set_elem__(tensor, i, e):
-        if tensor_to_check_dtype == np.float32:
+        if tensor_to_check_dtype == np.float16:
+            numpy_tensor = np.array(tensor).astype(np.float16)
+            shape = numpy_tensor.shape
+            numpy_tensor = numpy_tensor.flatten()
+            numpy_tensor[i] = e
+            numpy_tensor = numpy_tensor.reshape(shape).view(np.uint16)
+            tensor.set(numpy_tensor, place)
+        elif tensor_to_check_dtype == np.float32:
             tensor._set_float_element(i, e)
         else:
             tensor._set_double_element(i, e)
 
     # we only compute gradient of one element each time.
     # we use a for loop to compute the gradient of every element.
-    for i in xrange(tensor_size):
+    for i in range(tensor_size):
         if in_place:
             set_input(scope, op, inputs, place)
 
@@ -133,13 +150,18 @@ class OpTest(unittest.TestCase):
         if not self.call_once:
             self.call_once = True
             self.dtype = data_type
+            # See the comment of np_dtype_to_fluid_dtype
+            # If the input type is uint16, we assume use float16
+            # for lodtensor dtype.
+            if self.dtype == np.uint16:
+                self.dtype == np.float16
 
     def infer_dtype_from_inputs_outputs(self, inputs, outputs):
         def infer_dtype(numpy_dict):
             assert isinstance(
                 numpy_dict,
                 dict), "self.inputs, self.outputs must be numpy_dict"
-            for var_name, var_value in numpy_dict.iteritems():
+            for var_name, var_value in numpy_dict.items():
                 if isinstance(var_value, (np.ndarray, np.generic)):
                     self.try_call_once(var_value.dtype)
                 elif isinstance(var_value, (list, tuple)):
@@ -161,19 +183,25 @@ class OpTest(unittest.TestCase):
                 for name, np_value in self.inputs[var_name]:
                     tensor = core.LoDTensor()
                     if isinstance(np_value, tuple):
-                        tensor.set(np_value[0], place)
+                        tensor.set(
+                            OpTest.np_value_to_fluid_value(np_value[0]), place)
                         tensor.set_recursive_sequence_lengths(np_value[1])
                     else:
-                        tensor.set(np_value, place)
+                        tensor.set(
+                            OpTest.np_value_to_fluid_value(np_value), place)
                     feed_map[name] = tensor
             else:
                 tensor = core.LoDTensor()
                 if isinstance(self.inputs[var_name], tuple):
-                    tensor.set(self.inputs[var_name][0], place)
+                    tensor.set(
+                        OpTest.np_value_to_fluid_value(self.inputs[var_name][
+                            0]), place)
                     tensor.set_recursive_sequence_lengths(self.inputs[var_name][
                         1])
                 else:
-                    tensor.set(self.inputs[var_name], place)
+                    tensor.set(
+                        OpTest.np_value_to_fluid_value(self.inputs[var_name]),
+                        place)
                 feed_map[var_name] = tensor
 
         return feed_map
@@ -197,7 +225,7 @@ class OpTest(unittest.TestCase):
 
     def _get_io_vars(self, block, numpy_inputs):
         inputs = {}
-        for name, value in numpy_inputs.iteritems():
+        for name, value in numpy_inputs.items():
             if isinstance(value, list):
                 var_list = [
                     block.var(sub_name) for sub_name, sub_value in value
@@ -240,7 +268,7 @@ class OpTest(unittest.TestCase):
         # if the fetch_list is customized by user, we use it directly.
         # if not, fill the fetch_list by the user configured outputs in test.
         if len(fetch_list) == 0:
-            for var_name, var in outputs.iteritems():
+            for var_name, var in outputs.items():
                 if isinstance(var, list):
                     for v in var:
                         fetch_list.append(v)
@@ -252,7 +280,7 @@ class OpTest(unittest.TestCase):
                 fetch_list.append(str(out_name))
         # fetch_list = map(block.var, fetch_list)
         if not isinstance(fetch_list[0], fluid.framework.Variable):
-            fetch_list = map(block.var, fetch_list)
+            fetch_list = list(map(block.var, fetch_list))
         outs = executor.run(program,
                             feed=feed_map,
                             fetch_list=fetch_list,
@@ -307,13 +335,22 @@ class OpTest(unittest.TestCase):
                     np.allclose(
                         actual_t, expect_t, atol=atol),
                     "Output (" + out_name + ") has diff at " + str(place) +
-                    str(actual_t) + "\n" + str(expect_t))
+                    "\nExpect " + str(expect_t) + "\n" + "But Got" +
+                    str(actual_t))
                 if isinstance(expect, tuple):
                     self.assertListEqual(actual.recursive_sequence_lengths(),
                                          expect[1], "Output (" + out_name +
                                          ") has different lod at " + str(place))
 
     def _get_places(self):
+        if self.dtype == np.float16:
+            if core.is_compiled_with_cuda() and core.op_support_gpu(
+                    self.op_type):
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    return [place]
+            else:
+                return []
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
             places.append(core.CUDAPlace(0))
@@ -334,7 +371,7 @@ class OpTest(unittest.TestCase):
     def __assert_is_close(self, numeric_grads, analytic_grads, names,
                           max_relative_error, msg_prefix):
 
-        for a, b, name in itertools.izip(numeric_grads, analytic_grads, names):
+        for a, b, name in zip(numeric_grads, analytic_grads, names):
             abs_a = np.abs(a)
             abs_a[abs_a < 1e-3] = 1
 
@@ -344,9 +381,9 @@ class OpTest(unittest.TestCase):
             def err_msg():
                 offset = np.argmax(diff_mat > max_relative_error)
                 return ("%s Variable %s max gradient diff %f over limit %f, "
-                        "the first error element is %d, %f, %f") % (
-                            msg_prefix, name, max_diff, max_relative_error,
-                            offset, a.flatten()[offset], b.flatten()[offset])
+                        "the first error element is %d, expected %f, but got %f"
+                        ) % (msg_prefix, name, max_diff, max_relative_error,
+                             offset, a.flatten()[offset], b.flatten()[offset])
 
             self.assertLessEqual(max_diff, max_relative_error, err_msg())
 
@@ -435,6 +472,21 @@ class OpTest(unittest.TestCase):
             input.dtype = np.uint16
         return input
 
+    @staticmethod
+    def fluid_dtype_to_np_dtype(self, dtype):
+        """
+        See above, convert the dtype to normal type.
+        """
+        if dtype == np.uint16:
+            dtype = np.float16
+        return dtype
+
+    @staticmethod
+    def np_value_to_fluid_value(input):
+        if input.dtype == np.float16:
+            input = input.view(np.uint16)
+        return input
+
     def _get_gradient(self,
                       input_to_check,
                       place,
@@ -457,9 +509,9 @@ class OpTest(unittest.TestCase):
             if isinstance(place, fluid.CUDAPlace(0)):
                 use_cuda = True
             executor = fluid.ParallelExecutor(
-                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
+                use_cuda=use_cuda, loss_name=loss.name, main_program=prog)
         else:
             executor = Executor(place)
-        return map(np.array,
-                   executor.run(prog, feed_dict, fetch_list,
-                                return_numpy=False))
+        return list(
+            map(np.array,
+                executor.run(prog, feed_dict, fetch_list, return_numpy=False)))
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index fcf86cc583..67c35e9de7 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -91,7 +91,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             first_loss, = run_executor(
                 exe=exe, feed=feed_dict, fetch_list=[loss.name])
 
-            for i in xrange(iter):
+            for i in range(iter):
                 run_executor(exe=exe, feed=feed_dict, fetch_list=[])
 
             last_loss, = run_executor(
@@ -99,8 +99,8 @@ class TestParallelExecutorBase(unittest.TestCase):
             end = time.time()
 
             if batch_size is not None:
-                print "%.4f Instance per second" % (
-                    (batch_size * iter + 2) / (end - begin))
+                print("%.4f Instance per second" % (
+                    (batch_size * iter + 2) / (end - begin)))
 
             avg_last_loss_val = np.array(last_loss).mean()
             avg_first_loss_val = np.array(first_loss).mean()
@@ -108,6 +108,6 @@ class TestParallelExecutorBase(unittest.TestCase):
                     float(avg_first_loss_val)):
                 sys.exit("got NaN loss, training failed.")
 
-            print first_loss, last_loss
+            print(first_loss, last_loss)
             # self.assertGreater(first_loss[0], last_loss[0])
             return first_loss, last_loss
diff --git a/python/paddle/fluid/tests/unittests/test_accuracy_op.py b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
index 212a87e529..db1861fd10 100644
--- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
@@ -26,7 +26,7 @@ class TestAccuracyOp(OpTest):
         label = np.random.randint(0, 2, (n, 1))
         self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
         num_correct = 0
-        for rowid in xrange(n):
+        for rowid in range(n):
             for ele in indices[rowid]:
                 if ele == label[rowid]:
                     num_correct += 1
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 5ed387fb12..34f9cf0620 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -313,9 +313,9 @@ class TestAbs(OpTest):
         self.init_dtype()
 
         x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
-        # Because we set delta = 0.005 in caculating numeric gradient,
+        # Because we set delta = 0.005 in calculating numeric gradient,
         # if x is too small, such as 0.002, x_neg will be -0.003
-        # x_pos will be 0.007, so the numeric gradient is unaccurate.
+        # x_pos will be 0.007, so the numeric gradient is inaccurate.
         # we should avoid this
         x[np.abs(x) < 0.005] = 0.02
         out = np.abs(x)
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 3c65f3d44a..fa4b39879c 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -273,7 +273,7 @@ class TestSparseAdamOp(unittest.TestCase):
         self.setup(scope, place)
 
         op_args = dict()
-        for key, np_array in self.dense_inputs.iteritems():
+        for key, np_array in self.dense_inputs.items():
             var = scope.var(key).get_tensor()
             var.set(np_array, place)
             op_args[key] = key
@@ -290,7 +290,7 @@ class TestSparseAdamOp(unittest.TestCase):
         adam_op = Operator("adam", **op_args)
         adam_op.run(scope, place)
 
-        for key, np_array in self.outputs.iteritems():
+        for key, np_array in self.outputs.items():
             out_var = scope.var(key).get_tensor()
             actual = np.array(out_var)
             actual = actual.reshape([actual.size])
diff --git a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
index a49e9035a4..0000fb0958 100644
--- a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
+++ b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
@@ -80,8 +80,9 @@ class TestArrayReadWrite(unittest.TestCase):
 
         append_backward(total_sum_scaled)
 
-        g_vars = map(default_main_program().global_block().var,
-                     [each_x.name + "@GRAD" for each_x in x])
+        g_vars = list(
+            map(default_main_program().global_block().var,
+                [each_x.name + "@GRAD" for each_x in x]))
         g_out = [
             item.sum()
             for item in exe.run(
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index fcb2612326..f805fdc35f 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -415,7 +415,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
             self.__assert_close(scale_grad, out[6], "scale_grad")
             self.__assert_close(bias_grad, out[7], "bias_grad")
 
-            print "op test forward passed: ", str(place), data_layout
+            print("op test forward passed: ", str(place), data_layout)
 
         places = [core.CPUPlace()]
 
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
index 167451edd8..e8283fc942 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
@@ -59,8 +59,7 @@ class BeamSearchOpTester(unittest.TestCase):
             np.allclose(
                 np.array(selected_scores),
                 np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]))
-        self.assertEqual(selected_ids.lod(),
-                         [[0L, 2L, 4L], [0L, 1L, 2L, 3L, 4L]])
+        self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]])
 
     def _create_pre_ids(self):
         np_data = np.array([[1, 2, 3, 4]], dtype='int64')
diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
index d5bd726c4a..ceeca25b74 100644
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
@@ -48,7 +48,7 @@ def bipartite_match(distance, match_indices, match_dist):
 
 def argmax_match(distance, match_indices, match_dist, threshold):
     r, c = distance.shape
-    for j in xrange(c):
+    for j in range(c):
         if match_indices[j] != -1:
             continue
         col_dist = distance[:, j]
diff --git a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
index 23932194f0..354110f1f9 100644
--- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
+++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
@@ -63,7 +63,7 @@ class TestChunkEvalOp(OpTest):
         # generate chunk beginnings
         chunk_begins = sorted(
             np.random.choice(
-                range(starts[-1]), num_chunks, replace=False))
+                list(range(starts[-1])), num_chunks, replace=False))
         seq_chunk_begins = []
         begin_idx = 0
         # divide chunks into sequences
@@ -93,7 +93,7 @@ class TestChunkEvalOp(OpTest):
                                   self.num_infer_chunks + self.num_label_chunks
                                   - self.num_correct_chunks)
         correct_chunks = np.random.choice(
-            range(len(chunks)), self.num_correct_chunks, replace=False)
+            list(range(len(chunks))), self.num_correct_chunks, replace=False)
         infer_chunks = np.random.choice(
             [x for x in range(len(chunks)) if x not in correct_chunks],
             self.num_infer_chunks - self.num_correct_chunks,
@@ -138,7 +138,8 @@ class TestChunkEvalOp(OpTest):
         infer.fill(self.num_chunk_types * self.num_tag_types)
         label = np.copy(infer)
         starts = np.random.choice(
-            range(1, self.batch_size), self.num_sequences - 1,
+            list(range(1, self.batch_size)),
+            self.num_sequences - 1,
             replace=False).tolist()
         starts.extend([0, self.batch_size])
         starts = sorted(starts)
diff --git a/python/paddle/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py
index d9f83905e6..77869a1242 100644
--- a/python/paddle/fluid/tests/unittests/test_conditional_block.py
+++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py
@@ -39,7 +39,7 @@ class ConditionalBlockTest(unittest.TestCase):
         x = numpy.random.random(size=(10, 1)).astype('float32')
 
         outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
-        print outs
+        print(outs)
         loss = layers.mean(out)
         append_backward(loss=loss)
         outs = exe.run(
@@ -47,7 +47,7 @@ class ConditionalBlockTest(unittest.TestCase):
             fetch_list=[
                 default_main_program().block(0).var(data.name + "@GRAD")
             ])[0]
-        print outs
+        print(outs)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
index db6be21baa..d0de7ad52c 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
@@ -20,16 +20,19 @@ from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride
 class TestMKLDNN(TestConv2dOp):
     def init_kernel_type(self):
         self.use_mkldnn = True
+        self.data_format = "NCHW"
 
 
 class TestMKLDNNWithPad(TestWithPad):
     def init_kernel_type(self):
         self.use_mkldnn = True
+        self.data_format = "NCHW"
 
 
 class TestMKLDNNWithStride(TestWithStride):
     def init_kernel_type(self):
         self.use_mkldnn = True
+        self.data_format = "NCHW"
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index a478649541..bb1cd87d61 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -66,6 +66,7 @@ class TestConv2dOp(OpTest):
         self.op_type = "conv2d"
         self.use_cudnn = False
         self.use_mkldnn = False
+        self.data_format = "AnyLayout"
         self.dtype = np.float32
         self.init_kernel_type()
         self.init_group()
@@ -93,7 +94,8 @@ class TestConv2dOp(OpTest):
             'groups': self.groups,
             'dilations': self.dilations,
             'use_cudnn': self.use_cudnn,
-            'use_mkldnn': self.use_mkldnn
+            'use_mkldnn': self.use_mkldnn,
+            'data_format': self.data_format
         }
         self.outputs = {'Output': output}
 
@@ -101,59 +103,35 @@ class TestConv2dOp(OpTest):
         return core.is_compiled_with_cuda() and self.use_cudnn
 
     def test_check_output(self):
-        if self.testcudnn():
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
-        else:
-            self.check_output()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        self.check_output_with_place(place, atol=1e-5)
 
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        if self.testcudnn():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place,
-                set(['Input', 'Filter']),
-                'Output',
-                max_relative_error=0.02)
-        else:
-            self.check_grad(
-                set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        self.check_grad_with_place(
+            place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
             return
-        if self.testcudnn():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['Input'],
-                'Output',
-                max_relative_error=0.02,
-                no_grad_set=set(['Filter']))
-        else:
-            self.check_grad(
-                ['Input'],
-                'Output',
-                max_relative_error=0.02,
-                no_grad_set=set(['Filter']))
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        self.check_grad_with_place(
+            place, ['Input'],
+            'Output',
+            max_relative_error=0.02,
+            no_grad_set=set(['Filter']))
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
             return
-        if self.testcudnn():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['Filter'],
-                'Output',
-                max_relative_error=0.02,
-                no_grad_set=set(['Input']))
-        else:
-            self.check_grad(
-                ['Filter'],
-                'Output',
-                max_relative_error=0.02,
-                no_grad_set=set(['Input']))
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        self.check_grad_with_place(
+            place, ['Filter'],
+            'Output',
+            max_relative_error=0.02,
+            no_grad_set=set(['Input']))
 
     def init_test_case(self):
         self.pad = [0, 0]
diff --git a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
index 5d4d244f43..9fdb7baa90 100644
--- a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
@@ -22,8 +22,8 @@ def conv_shift_forward(x, y):
     M = x.shape[1]
     N = y.shape[1]
     y_half_width = (N - 1) / 2
-    for i in xrange(M):
-        for j in xrange(N):
+    for i in range(M):
+        for j in range(N):
             out[:, i] += x[:, (i + j + M - y_half_width) % M] * y[:, j]
     return out
 
diff --git a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
index 5e6f9a20a9..07c89eefc3 100644
--- a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
+++ b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
@@ -18,7 +18,7 @@ import paddle.fluid.layers as layers
 
 class TestDocString(unittest.TestCase):
     def test_layer_doc_string(self):
-        print layers.dropout.__doc__
+        print(layers.dropout.__doc__)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py
index aa09b0ea44..951282e8ba 100644
--- a/python/paddle/fluid/tests/unittests/test_data_balance.py
+++ b/python/paddle/fluid/tests/unittests/test_data_balance.py
@@ -21,7 +21,7 @@ import numpy as np
 class TestDataBalance(unittest.TestCase):
     def prepare_data(self):
         def fake_data_generator():
-            for n in xrange(self.total_ins_num):
+            for n in range(self.total_ins_num):
                 yield np.ones((3, 4)) * n, n
 
         # Prepare data
@@ -41,7 +41,7 @@ class TestDataBalance(unittest.TestCase):
 
     def prepare_lod_data(self):
         def fake_data_generator():
-            for n in xrange(1, self.total_ins_num + 1):
+            for n in range(1, self.total_ins_num + 1):
                 d1 = (np.ones((n, 3)) * n).astype('float32')
                 d2 = (np.array(n).reshape((1, 1))).astype('int32')
                 yield d1, d2
@@ -58,9 +58,9 @@ class TestDataBalance(unittest.TestCase):
                             (0, 1))
                     ]
                     lod = [0]
-                    for _ in xrange(self.batch_size):
+                    for _ in range(self.batch_size):
                         try:
-                            ins = generator.next()
+                            ins = next(generator)
                         except StopIteration:
                             eof = True
                             break
diff --git a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
index a3bf7b544b..868bcca881 100644
--- a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
+++ b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
@@ -39,7 +39,7 @@ class TestDefaultScopeFuncs(unittest.TestCase):
             self.assertTrue(i.is_int())
             self.assertEqual(10, i.get_int())
 
-        for _ in xrange(10):
+        for _ in range(10):
             scoped_function(__new_scope__)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
index 05d3367ad8..8b66d1b270 100644
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -176,7 +176,7 @@ class TestDetectionMAPOp(OpTest):
             true_pos[label].append([score, tp])
             false_pos[label].append([score, fp])
 
-        for (label, label_pos_num) in label_count.items():
+        for (label, label_pos_num) in list(label_count.items()):
             if label_pos_num == 0 or label not in true_pos: continue
             label_true_pos = true_pos[label]
             label_false_pos = false_pos[label]
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
new file mode 100644
index 0000000000..58cfd4e1fd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -0,0 +1,138 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+
+import unittest
+import os
+import sys
+import signal
+import subprocess
+
+
+class TestDistBase(unittest.TestCase):
+    def setUp(self):
+        self._trainers = 2
+        self._pservers = 2
+        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
+        self._python_interp = "python"
+
+    def start_pserver(self, model_file):
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
+             self._trainers)
+        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
+             self._trainers)
+
+        ps0_proc = subprocess.Popen(
+            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        ps1_proc = subprocess.Popen(
+            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return ps0_proc, ps1_proc
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 50
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error as e:
+                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
+                                 (e, retry_times))
+                retry_times -= 1
+
+    def check_with_place(self, model_file, delta=1e-3):
+        # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
+        required_envs = {
+            "PATH": os.getenv("PATH"),
+            "PYTHONPATH": os.getenv("PYTHONPATH"),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_cudnn_deterministic": "1"
+        }
+        # Run local to get a base line
+        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+        env_local.update(required_envs)
+        local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \
+            (self._python_interp, model_file,
+             "127.0.0.1:1234", "127.0.0.1:1234", 1)
+        local_proc = subprocess.Popen(
+            local_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env_local)
+        local_proc.wait()
+        out, err = local_proc.communicate()
+        local_ret = out
+        sys.stderr.write('local_loss: %s\n' % local_ret)
+        sys.stderr.write('local_stderr: %s\n' % err)
+
+        # Run dist train to compare with local results
+        ps0, ps1 = self.start_pserver(model_file)
+        self._wait_ps_ready(ps0.pid)
+        self._wait_ps_ready(ps1.pid)
+
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
+             self._trainers)
+        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \
+            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
+             self._trainers)
+
+        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+        env0.update(required_envs)
+        env1.update(required_envs)
+        FNULL = open(os.devnull, 'w')
+
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env0)
+        tr1_proc = subprocess.Popen(
+            tr1_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env1)
+
+        tr0_proc.wait()
+        tr1_proc.wait()
+        out, err = tr0_proc.communicate()
+        sys.stderr.write('dist_stderr: %s\n' % err)
+        loss_data0 = out
+        sys.stderr.write('dist_loss: %s\n' % loss_data0)
+        lines = loss_data0.split("\n")
+        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
+        dist_last_loss = eval(lines[1].replace(" ", ","))[0]
+
+        local_lines = local_ret.split("\n")
+        local_first_loss = eval(local_lines[0])[0]
+        local_last_loss = eval(local_lines[1])[0]
+
+        self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
+        self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)
+
+        # check tr0_out
+        # FIXME: ensure the server process is killed
+        # replace with ps0.terminate()
+        os.kill(ps0.pid, signal.SIGKILL)
+        os.kill(ps1.pid, signal.SIGKILL)
+        FNULL.close()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
index ad2d57f7c5..a6fcbd977f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -25,6 +25,7 @@ import unittest
 from multiprocessing import Process
 import os
 import signal
+from functools import reduce
 
 SEED = 1
 DTYPE = "float32"
@@ -172,12 +173,12 @@ class TestDistMnist(unittest.TestCase):
         exe.run(fluid.default_startup_program())
 
         feed_var_list = [
-            var for var in trainer_prog.global_block().vars.itervalues()
+            var for var in trainer_prog.global_block().vars.values()
             if var.is_data
         ]
 
         feeder = fluid.DataFeeder(feed_var_list, place)
-        for pass_id in xrange(10):
+        for pass_id in range(10):
             for batch_id, data in enumerate(train_reader()):
                 exe.run(trainer_prog, feed=feeder.feed(data))
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
index e3e7036f08..f3a5fd6985 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -11,111 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import numpy as np
-import argparse
-import time
-import math
-
 import unittest
-import os
-import signal
-import subprocess
-
-
-class TestDistSeResneXt2x2(unittest.TestCase):
-    def setUp(self):
-        self._trainers = 2
-        self._pservers = 2
-        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
-        self._python_interp = "python"
-
-    def start_pserver(self):
-        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
-        ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
-
-        ps0_proc = subprocess.Popen(
-            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        ps1_proc = subprocess.Popen(
-            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        return ps0_proc, ps1_proc
-
-    def _wait_ps_ready(self, pid):
-        retry_times = 20
-        while True:
-            assert retry_times >= 0, "wait ps ready failed"
-            time.sleep(3)
-            try:
-                # the listen_and_serv_op would touch a file which contains the listen port
-                # on the /tmp directory until it was ready to process all the RPC call.
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error:
-                retry_times -= 1
-
-    def non_test_with_place(self):
-        # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
-        required_envs = {
-            "PATH": os.getenv("PATH"),
-            "PYTHONPATH": os.getenv("PYTHONPATH"),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
-            "FLAGS_fraction_of_gpu_memory_to_use": "0.15"
-        }
-        # Run local to get a base line
-        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
-        env_local.update(required_envs)
-        local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \
-            (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1)
-        local_proc = subprocess.Popen(
-            local_cmd.split(" "), stdout=subprocess.PIPE, env=env_local)
-        local_proc.wait()
-        local_ret = local_proc.stdout.read()
-
-        # Run dist train to compare with local results
-        ps0, ps1 = self.start_pserver()
-        self._wait_ps_ready(ps0.pid)
-        self._wait_ps_ready(ps1.pid)
-
-        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
-        tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \
-            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
-
-        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
-        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
-        env0.update(required_envs)
-        env1.update(required_envs)
-        FNULL = open(os.devnull, 'w')
-
-        tr0_proc = subprocess.Popen(
-            tr0_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env0)
-        tr1_proc = subprocess.Popen(
-            tr1_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env1)
-
-        tr0_proc.wait()
-        tr1_proc.wait()
-        loss_data0 = tr0_proc.stdout.read()
-        lines = loss_data0.split("\n")
-        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
-        dist_last_loss = eval(lines[1].replace(" ", ","))[0]
-
-        local_lines = local_ret.split("\n")
-        local_first_loss = eval(local_lines[0])[0]
-        local_last_loss = eval(local_lines[1])[0]
+from test_dist_base import TestDistBase
 
-        self.assertAlmostEqual(local_first_loss, dist_first_loss)
-        self.assertAlmostEqual(local_last_loss, dist_last_loss)
 
-        # check tr0_out
-        # FIXME: ensure the server process is killed
-        # replace with ps0.terminate()
-        os.kill(ps0.pid, signal.SIGKILL)
-        os.kill(ps1.pid, signal.SIGKILL)
-        FNULL.close()
+class TestDistSeResneXt2x2(TestDistBase):
+    def test_se_resnext(self):
+        self.check_with_place("dist_se_resnext.py")
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
new file mode 100644
index 0000000000..68cd35d751
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -0,0 +1,27 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_dist_base import TestDistBase
+
+
+class TestDistTransformer2x2(TestDistBase):
+    def test_transformer(self):
+        # TODO(paddle-dev): check if the delta is OK.
+        # Usually start around ~8000 and converge to ~5000
+        self.check_with_place("dist_transformer.py", delta=400)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 9dbef0693b..3bc5e6ada5 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -51,17 +51,17 @@ class TranspilerTest(unittest.TestCase):
         self.origin_prog = main.clone()
         return main
 
-    def get_trainer(self, config=None):
-        t = self._transpiler_instance(config)
+    def get_trainer(self, config=None, sync_mode=True):
+        t = self._transpiler_instance(config, sync_mode)
         return t.get_trainer_program()
 
-    def get_pserver(self, ep, config=None):
-        t = self._transpiler_instance(config)
+    def get_pserver(self, ep, config=None, sync_mode=True):
+        t = self._transpiler_instance(config, sync_mode)
         pserver = t.get_pserver_program(ep)
         startup = t.get_startup_program(ep, pserver)
         return pserver, startup
 
-    def _transpiler_instance(self, config=None):
+    def _transpiler_instance(self, config=None, sync_mode=True):
         if not self.transpiler:
             main = self.get_main_program()
             self.transpiler = fluid.DistributeTranspiler(config=config)
@@ -69,13 +69,23 @@ class TranspilerTest(unittest.TestCase):
                 self.trainer_id,
                 program=main,
                 pservers=self.pserver_eps,
-                trainers=self.trainers)
+                trainers=self.trainers,
+                sync_mode=sync_mode)
 
         return self.transpiler
 
+    def transpiler_test_impl(self):
+        pass
 
-class TestBasicModel(TranspilerTest):
     def test_transpiler(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            self.transpiler_test_impl()
+
+
+class TestBasicModel(TranspilerTest):
+    def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
         pserver2, startup2 = self.get_pserver(self.pserver2_ep)
 
@@ -123,7 +133,7 @@ class TestBasicModel(TranspilerTest):
 
 
 class TestBasicModelWithLargeBlockSize(TranspilerTest):
-    def test_transpiler(self):
+    def transpiler_test_impl(self):
         config = fluid.DistributeTranspilerConfig()
         config.min_block_size = 1048576
 
@@ -148,10 +158,10 @@ class TestBasicModelWithLargeBlockSize(TranspilerTest):
                          ["sum", "scale", "sgd"])
         # confirm startup program
         self.assertEqual([op.type for op in startup.global_block().ops],
-                         ["fill_constant", "fill_constant", "fill_constant"])
+                         ["fill_constant", "fill_constant"])
         # the variable #fc_w will be split into two blocks
         fc_w_var = startup2.global_block().var("fc_w")
-        self.assertEqual(fc_w_var.shape, (1000L, 1000L))
+        self.assertEqual(fc_w_var.shape, (1000, 1000))
         # all parameters should be optimized on pserver
 
         pserver_params = []
@@ -177,16 +187,16 @@ class TestNoSliceVar(TranspilerTest):
     def setUp(self):
         super(TestNoSliceVar, self).setUp()
 
-    def test_transpiler(self):
+    def transpiler_test_impl(self):
         config = fluid.DistributeTranspilerConfig()
         config.slice_var_up = False
 
         _, startup = self.get_pserver(self.pserver1_ep, config)
         _, startup2 = self.get_pserver(self.pserver2_ep, config)
 
-        if startup.global_block().vars.has_key("fc_w"):
+        if "fc_w" in startup.global_block().vars:
             fc_w_var = startup.global_block().vars["fc_w"]
-        elif startup2.global_block().vars.has_key("fc_w"):
+        elif "fc_w" in startup2.global_block().vars:
             fc_w_var = startup2.global_block().vars["fc_w"]
 
         self.assertEqual(fc_w_var.shape, (1000, 1000))
@@ -212,7 +222,7 @@ class TestLRDecay(TranspilerTest):
         sgd_optimizer.minimize(avg_cost)
         return
 
-    def test_transpiler(self):
+    def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
         trainer = self.get_trainer()
 
@@ -242,7 +252,7 @@ class TestLRDecayConditional(TranspilerTest):
         sgd_optimizer.minimize(avg_cost)
         return
 
-    def test_transpiler(self):
+    def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
         trainer = self.get_trainer()
 
@@ -291,7 +301,7 @@ class TestL2Decay(TranspilerTest):
         sgd_optimizer.minimize(avg_cost)
         return
 
-    def test_transpiler(self):
+    def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
         trainer = self.get_trainer()
 
@@ -326,7 +336,7 @@ class TestL2DecayWithPiecewise(TranspilerTest):
         sgd_optimizer.minimize(avg_cost)
         return
 
-    def test_transpiler(self):
+    def transpiler_test_impl(self):
         pserver, startup = self.get_pserver(self.pserver1_ep)
         trainer = self.get_trainer()
 
@@ -350,5 +360,181 @@ class TestL2DecayWithPiecewise(TranspilerTest):
             ["sum", "scale", "scale", "elementwise_add", "momentum"])
 
 
+class TestDistLookupTableBase(TranspilerTest):
+    def network_with_table(self, is_sparse, is_distributed):
+        def emb_pool(ids):
+            table_size = 1000
+            emb_size = 64
+            emb = fluid.layers.embedding(
+                input=ids,
+                size=[table_size, emb_size],
+                dtype='float32',
+                param_attr='shared_w',  # share parameter
+                is_sparse=is_sparse,
+                is_distributed=is_distributed)
+            pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
+            return pool
+
+        title_ids = fluid.layers.data(
+            name='title_ids', shape=[1], dtype='int64', lod_level=1)
+        brand_ids = fluid.layers.data(
+            name='brand_ids', shape=[1], dtype='int64', lod_level=1)
+        title_emb = emb_pool(title_ids)
+        brand_emb = emb_pool(brand_ids)
+        fc0 = fluid.layers.concat(input=[title_emb, brand_emb], axis=1)
+        predict = fluid.layers.fc(input=fc0,
+                                  size=2,
+                                  act=None,
+                                  param_attr=fluid.ParamAttr(name='fc_w'),
+                                  bias_attr=fluid.ParamAttr(name='fc_b'))
+
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(cost)
+        optimizer = fluid.optimizer.Adam(learning_rate=0.003)
+        optimizer.minimize(avg_cost)
+
+
+class TestLocalLookupTable(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(is_sparse=True, is_distributed=False)
+
+    def transpiler_test_impl(self):
+        pserver1, startup1 = self.get_pserver(self.pserver1_ep)
+
+        self.assertEqual(len(pserver1.blocks), 3)
+        # 0 listen_and_serv
+        # 1 optimize for fc_w or fc_b adam
+        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+        # 2 optimize for table adam
+        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["sum", "adam", "scale", "scale"])
+
+        trainer = self.get_trainer()
+        self.assertEqual(len(trainer.blocks), 1)
+        ops = [
+            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
+            'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
+            'fill_constant', 'mean_grad', 'cross_entropy_grad',
+            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
+            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sum', 'split_selected_rows', 'send',
+            'send_barrier', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
+        ]
+        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+
+
+class TestDistLookupTable(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(is_sparse=True, is_distributed=True)
+
+    def transpiler_test_impl(self):
+        pserver1, startup1 = self.get_pserver(self.pserver1_ep)
+
+        self.assertEqual(len(pserver1.blocks), 6)
+        # 0 listen_and_serv
+        # 1 optimize for fc_w or fc_b adam
+        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+        # 2 optimize for table sgd
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["sum", "sgd"])
+        # 3 prefetch -> lookup_sparse_table for data0
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["lookup_sparse_table"])
+        # 4 prefetch -> lookup_sparse_table for data1
+        self.assertEqual([op.type for op in pserver1.blocks[4].ops],
+                         ["lookup_sparse_table"])
+        # 5 save table
+        self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
+
+        trainer = self.get_trainer()
+        self.assertEqual(len(trainer.blocks), 1)
+        ops = [
+            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
+            'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul',
+            'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
+            'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
+            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'sum', 'split_ids', 'send', 'send_barrier', 'recv', 'recv',
+            'fetch_barrier'
+        ]
+        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+
+
+class TestAsyncLocalLookupTable(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(is_sparse=True, is_distributed=False)
+
+    def transpiler_test_impl(self):
+        config = fluid.DistributeTranspilerConfig()
+        pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
+
+        self.assertEqual(len(pserver1.blocks), 3)
+        # 0 listen_and_serv
+        # 1 optimize for fc_w or fc_b adam
+        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
+                         ["adam", "scale", "scale"])
+        # 2 optimize for table adam
+        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["adam", "scale", "scale"])
+
+        trainer = self.get_trainer(config)
+        self.assertEqual(len(trainer.blocks), 1)
+        ops = [
+            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
+            'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
+            'fill_constant', 'mean_grad', 'cross_entropy_grad',
+            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
+            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv',
+            'recv', 'recv', 'concat'
+        ]
+        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+
+
+class TestAsyncDistLookupTable(TestDistLookupTableBase):
+    def net_conf(self):
+        self.network_with_table(is_sparse=True, is_distributed=True)
+
+    def transpiler_test_impl(self):
+        config = fluid.DistributeTranspilerConfig()
+
+        pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
+
+        self.assertEqual(len(pserver1.blocks), 6)
+        # 0 listen_and_serv
+        # 1 optimize for fc_w or fc_b adam
+        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
+                         ["adam", "scale", "scale"])
+        # 2 optimize for table sgd
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sgd"])
+        # 3 prefetch -> lookup_sparse_table for data0
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["lookup_sparse_table"])
+        # 4 prefetch -> lookup_sparse_table for data1
+        self.assertEqual([op.type for op in pserver1.blocks[4].ops],
+                         ["lookup_sparse_table"])
+        # 5 save table
+        self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
+
+        trainer = self.get_trainer(config)
+        self.assertEqual(len(trainer.blocks), 1)
+        ops = [
+            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
+            'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul',
+            'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
+            'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
+            'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'sum', 'split_ids', 'send', 'recv', 'recv'
+        ]
+        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
index 712fd5849d..4bb3998f89 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
@@ -183,12 +183,12 @@ class TestDistMnist(unittest.TestCase):
             exec_strategy=exec_strategy)
 
         feed_var_list = [
-            var for var in trainer_prog.global_block().vars.itervalues()
+            var for var in trainer_prog.global_block().vars.values()
             if var.is_data
         ]
 
         feeder = fluid.DataFeeder(feed_var_list, place)
-        for pass_id in xrange(10):
+        for pass_id in range(10):
             for batch_id, data in enumerate(train_reader()):
                 avg_loss_np = train_exe.run(feed=feeder.feed(data),
                                             fetch_list=[avg_cost.name])
diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
index 4448de8839..fdc6adc93b 100644
--- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
@@ -135,7 +135,7 @@ class TestDynRNN(unittest.TestCase):
         loss_0 = exe.run(main_program,
                          feed=feeder.feed(data),
                          fetch_list=[loss])[0]
-        for _ in xrange(100):
+        for _ in range(100):
             val = exe.run(main_program,
                           feed=feeder.feed(data),
                           fetch_list=[loss])[0]
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
index 0f289af284..7756885166 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
@@ -61,13 +61,13 @@ class BaseRNN(object):
         self.num_seq = num_seq
         self.inputs = collections.defaultdict(list)
 
-        for _ in xrange(num_seq):
+        for _ in range(num_seq):
             seq_len = random.randint(1, max_seq_len - 1)
             for iname in ins:
                 ishape = ins[iname].get('shape', None)
                 idtype = ins[iname].get('dtype', 'float32')
                 lst = []
-                for _ in xrange(seq_len):
+                for _ in range(seq_len):
                     lst.append(numpy.random.random(size=ishape).astype(idtype))
                 self.inputs[iname].append(lst)
 
@@ -96,16 +96,16 @@ class BaseRNN(object):
         for out in self.outputs:
             retv[out] = []
 
-        for seq_id in xrange(self.num_seq):
+        for seq_id in range(self.num_seq):
             for mname in self.mems:
                 self.mems[mname].reset()
             for out in self.outputs:
                 self.outputs[out].next_sequence()
 
-            iname0 = self.inputs.keys()[0]
+            iname0 = list(self.inputs.keys())[0]
             seq_len = len(self.inputs[iname0][seq_id])
 
-            for step_id in xrange(seq_len):
+            for step_id in range(seq_len):
                 xargs = dict()
 
                 for iname in self.inputs:
@@ -138,7 +138,7 @@ class BaseRNN(object):
         for iname in self.inputs:
             lod = []
             np_flatten = []
-            for seq_id in xrange(len(self.inputs[iname])):
+            for seq_id in range(len(self.inputs[iname])):
                 seq_len = len(self.inputs[iname][seq_id])
                 lod.append(seq_len)
                 np_flatten.extend(self.inputs[iname][seq_id])
@@ -159,8 +159,8 @@ class BaseRNN(object):
                              " which is not matrix")
         g = numpy.zeros(shape=p.shape, dtype=p.dtype)
 
-        for i in xrange(p.shape[0]):
-            for j in xrange(p.shape[1]):
+        for i in range(p.shape[0]):
+            for j in range(p.shape[1]):
                 o = p[i][j]
                 p[i][j] += delta
                 pos = self._exe_mean_out_()
@@ -184,7 +184,7 @@ class BaseRNN(object):
                 if len(item.shape) != 1:
                     raise ValueError("Not support")
 
-                for i in xrange(len(item)):
+                for i in range(len(item)):
                     o = item[i]
                     item[i] += delta
                     pos = self._exe_mean_out_()
@@ -198,14 +198,14 @@ class BaseRNN(object):
         if not return_one_tensor:
             return grad
 
-        for i in xrange(len(grad)):
+        for i in range(len(grad)):
             grad[i] = numpy.concatenate(grad[i])
         grad = numpy.concatenate(grad)
         return grad
 
     def _exe_mean_out_(self):
         outs = self.exe()
-        return numpy.array([o.mean() for o in outs.itervalues()]).mean()
+        return numpy.array([o.mean() for o in outs.values()]).mean()
 
 
 class SeedFixedTestCase(unittest.TestCase):
@@ -274,13 +274,14 @@ class TestSimpleMul(SeedFixedTestCase):
 
         cpu = fluid.CPUPlace()
         exe = fluid.Executor(cpu)
-        out, w_g, i_g = map(numpy.array,
-                            exe.run(feed=py_rnn.to_feed(cpu),
-                                    fetch_list=[
-                                        out, self.PARAM_NAME + "@GRAD",
-                                        self.DATA_NAME + "@GRAD"
-                                    ],
-                                    return_numpy=False))
+        out, w_g, i_g = list(
+            map(numpy.array,
+                exe.run(feed=py_rnn.to_feed(cpu),
+                        fetch_list=[
+                            out, self.PARAM_NAME + "@GRAD", self.DATA_NAME +
+                            "@GRAD"
+                        ],
+                        return_numpy=False)))
         out_by_python = py_rnn.exe()[self.OUT_NAME]
         self.assertTrue(numpy.allclose(out, out_by_python))
         w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
@@ -351,14 +352,15 @@ class TestSimpleMulWithMemory(SeedFixedTestCase):
         cpu = fluid.CPUPlace()
         exe = fluid.Executor(cpu)
         feed = py_rnn.to_feed(cpu)
-        last_np, w_g, i_g = map(numpy.array,
-                                exe.run(feed=feed,
-                                        fetch_list=[
-                                            last, self.PARAM_NAME + "@GRAD",
-                                            self.DATA_NAME + "@GRAD"
-                                        ],
-                                        return_numpy=False))
-        last_by_py, = py_rnn.exe().values()
+        last_np, w_g, i_g = list(
+            map(numpy.array,
+                exe.run(feed=feed,
+                        fetch_list=[
+                            last, self.PARAM_NAME + "@GRAD", self.DATA_NAME +
+                            "@GRAD"
+                        ],
+                        return_numpy=False)))
+        last_by_py, = list(py_rnn.exe().values())
         w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
         self.assertTrue(numpy.allclose(last_np, last_by_py))
 
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
index 31af124572..d182889a97 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
@@ -67,7 +67,7 @@ class TestDyRnnStaticInput(unittest.TestCase):
     def _lodtensor_to_ndarray(self, lod_tensor):
         dims = lod_tensor.shape()
         ndarray = np.zeros(shape=dims).astype('float32')
-        for i in xrange(np.product(dims)):
+        for i in range(np.product(dims)):
             ndarray.ravel()[i] = lod_tensor._get_float_element(i)
         return ndarray, lod_tensor.recursive_sequence_lengths()
 
@@ -114,7 +114,7 @@ class TestDyRnnStaticInput(unittest.TestCase):
                 shape=[1], dtype='int64', value=0)
             step_idx.stop_gradient = True
 
-            for i in xrange(self._max_sequence_len):
+            for i in range(self._max_sequence_len):
                 step_out = fluid.layers.array_read(static_input_out_array,
                                                    step_idx)
                 step_out.stop_gradient = True
@@ -140,27 +140,27 @@ class TestDyRnnStaticInput(unittest.TestCase):
         static_lod = self.static_input_tensor.recursive_sequence_lengths()
         static_sliced = []
         cur_offset = 0
-        for i in xrange(len(static_lod[0])):
+        for i in range(len(static_lod[0])):
             static_sliced.append(self.static_input_data[cur_offset:(
                 cur_offset + static_lod[0][i])])
             cur_offset += static_lod[0][i]
         static_seq_len = static_lod[0]
         static_reordered = []
-        for i in xrange(len(x_sorted_indices)):
+        for i in range(len(x_sorted_indices)):
             static_reordered.extend(static_sliced[x_sorted_indices[i]].tolist())
         static_seq_len_reordered = [
             static_seq_len[x_sorted_indices[i]]
-            for i in xrange(len(x_sorted_indices))
+            for i in range(len(x_sorted_indices))
         ]
 
         static_step_outs = []
         static_step_lods = []
 
-        for i in xrange(self._max_sequence_len):
+        for i in range(self._max_sequence_len):
             end = len(x_seq_len) - bisect.bisect_left(x_seq_len_sorted, i + 1)
             lod = []
             total_len = 0
-            for i in xrange(end):
+            for i in range(end):
                 lod.append(static_seq_len_reordered[i])
                 total_len += lod[-1]
             static_step_lods.append([lod])
@@ -174,7 +174,7 @@ class TestDyRnnStaticInput(unittest.TestCase):
         static_step_outs = self.build_graph(only_forward=True)
         self.exe.run(framework.default_startup_program())
         expected_outs, expected_lods = self.get_expected_static_step_outs()
-        for i in xrange(self._max_sequence_len):
+        for i in range(self._max_sequence_len):
             step_out, lod = self.fetch_value(static_step_outs[i])
             self.assertTrue(np.allclose(step_out, expected_outs[i]))
             self.assertTrue(np.allclose(lod, expected_lods[i]))
@@ -189,7 +189,7 @@ class TestDyRnnStaticInput(unittest.TestCase):
         numeric_gradients = np.zeros(shape=static_input_shape).astype('float32')
         # calculate numeric gradients
         tensor_size = np.product(static_input_shape)
-        for i in xrange(tensor_size):
+        for i in range(tensor_size):
             origin = self.static_input_tensor._get_float_element(i)
             x_pos = origin + self._delta
             self.static_input_tensor._set_float_element(i, x_pos)
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
index c6f45381af..6f35004489 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
@@ -26,7 +26,7 @@ class TestElementWiseAddOp(unittest.TestCase):
         def test_with_place(place):
             out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
             x_grad = out_grad
-            sum_axis = range(0, len(self.x.shape))
+            sum_axis = list(range(0, len(self.x.shape)))
             del sum_axis[self.axis]
             y_grad = np.sum(out_grad, axis=tuple(sum_axis))
 
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
index acf652d3fb..1854232194 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
@@ -20,8 +20,8 @@ class TestElementwiseOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+            'X': np.random.uniform(0.1, 1, [2, 3]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [2, 3]).astype("float32")
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
 
diff --git a/python/paddle/fluid/tests/unittests/test_extract_rows_op.py b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py
new file mode 100644
index 0000000000..6a41c44fe6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py
@@ -0,0 +1,58 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from op_test import OpTest
+
+
+class TestExtractRows(OpTest):
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        # create and initialize Variable
+        feature_len = 12
+        rows = [0, 4, 4, 7]
+        np_array = np.ones((len(rows), feature_len)).astype("float32")
+
+        in_x = scope.var('X').get_selected_rows()
+        in_x.set_height(len(rows))
+        in_x.set_rows(rows)
+        in_x_tensor = in_x.get_tensor()
+        in_x_tensor.set(np_array, place)
+
+        # create Out Variable
+        out_tensor = scope.var('Out').get_tensor()
+
+        # create and run lookup_table operator
+        extract_rows_op = Operator("extract_rows", X='X', Out='Out')
+        extract_rows_op.run(scope, place)
+
+        # get result from Out
+        result_array = np.array(out_tensor)
+        result_array = [ele[0] for ele in result_array]
+        assert result_array == rows
+
+    def test_concat_rows(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_flatten_op.py b/python/paddle/fluid/tests/unittests/test_flatten_op.py
new file mode 100644
index 0000000000..f8692ce2ea
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py
@@ -0,0 +1,68 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+from op_test import OpTest
+
+
+class TestFlattenOp(OpTest):
+    def setUp(self):
+        self.op_type = "flatten"
+        self.init_test_case()
+        self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
+        self.init_attrs()
+        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+    def init_test_case(self):
+        self.in_shape = (3, 2, 2, 5)
+        self.axis = 1
+        self.new_shape = (3, 20)
+
+    def init_attrs(self):
+        self.attrs = {"axis": self.axis}
+
+
+class TestFlattenOp(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 2, 3)
+        self.axis = 0
+        self.new_shape = (1, 36)
+
+
+class TestFlattenOpWithDefaultAxis(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 2, 3)
+        self.new_shape = (3, 12)
+
+    def init_attrs(self):
+        self.attrs = {}
+
+
+class TestFlattenOpSixDims(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 3, 2, 4, 4)
+        self.axis = 4
+        self.new_shape = (36, 16)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
new file mode 100644
index 0000000000..ec0a939e9e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
@@ -0,0 +1,818 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+# scale + add
+#   TestElementwiseAddOp
+#   TestFusedOperatorsOp_scalar
+#   TestFusedOperatorsOp_scalar2
+#   TestFusedOperatorsOp_Vector
+#   TestFusedOperatorsOp_broadcast_0
+#   TestFusedOperatorsOp_broadcast_1
+#   TestFusedOperatorsOp_broadcast_2
+#   TestFusedOperatorsOp_broadcast_3
+#   TestFusedOperatorsOp_broadcast_4
+#   TestFusedOperatorsOp_rowwise_add_0
+#   TestFusedOperatorsOp_rowwise_add_1
+#   TestFusedOperatorsOp_channelwise_add
+
+
+class TestElementwiseAddOp(OpTest):
+    def setUp(self):
+        self.op_type = "fused_elemwise_activation"
+        self.dtype = np.float32
+        self.axis = -1
+
+        self.init_axis()
+        self.init_dtype()
+        self.init_input()
+        self.init_output()
+        self.init_attr()
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
+        }
+        self.outputs = {'Out': self.out}
+
+    def init_input(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["scale", "elementwise_add"]
+        }
+
+    def init_dtype(self):
+        pass
+
+    def init_axis(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+
+
+class TestFusedOperatorsOp_scalar(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y) * self.scale
+
+
+class TestFusedOperatorsOp_scalar2(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y) * self.scale
+
+
+class TestFusedOperatorsOp_Vector(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.random((32, )).astype(self.dtype)
+        self.y = np.random.random((32, )).astype(self.dtype)
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y) * self.scale
+
+
+class TestFusedOperatorsOp_broadcast_0(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(2).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(2, 1, 1)) * self.scale
+
+
+class TestFusedOperatorsOp_broadcast_1(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(1, 3, 1)) * self.scale
+
+
+class TestFusedOperatorsOp_broadcast_2(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(4).astype(self.dtype)
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(1, 1, 4)) * self.scale
+
+
+class TestFusedOperatorsOp_broadcast_3(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(1, 3, 4, 1)) * self.scale
+
+
+class TestFusedOperatorsOp_broadcast_4(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 1).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(2, 1, 1, 1)) * self.scale
+
+
+class TestFusedOperatorsOp_rowwise_add_0(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(1, 3, 4)) * self.scale
+
+
+class TestFusedOperatorsOp_rowwise_add_1(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(2, 1).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y.reshape(1, 1)) * self.scale
+
+
+class TestFusedOperatorsOp_channelwise_add(TestElementwiseAddOp):
+    def init_input(self):
+        self.x = np.random.rand(3, 20, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
+
+    def init_axis(self):
+        self.axis = -1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = (self.x + self.y) * self.scale
+
+
+# add + scale
+#   TestElementwiseAddOp_f_add_scale
+#   TestFusedOperatorsOp_scalar_f_add_scale
+#   TestFusedOperatorsOp_scalar2_f_add_scale
+#   TestFusedOperatorsOp_Vector_f_add_scale
+#   TestFusedOperatorsOp_broadcast_0_f_add_scale
+#   TestFusedOperatorsOp_broadcast_1_f_add_scale
+#   TestFusedOperatorsOp_broadcast_2_f_add_scale
+#   TestFusedOperatorsOp_broadcast_3_f_add_scale
+#   TestFusedOperatorsOp_broadcast_4_f_add_scale
+#   TestFusedOperatorsOp_rowwise_add_0_f_add_scale
+#   TestFusedOperatorsOp_rowwise_add_1_f_add_scale
+#   TestFusedOperatorsOp_channelwise_add_f_add_scale
+
+
+class TestFusedOperatorsOp_f_add_scale(TestElementwiseAddOp):
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_scalar_f_add_scale(TestFusedOperatorsOp_scalar):
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_scalar2_f_add_scale(TestFusedOperatorsOp_scalar2):
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_Vector_f_add_scale(TestFusedOperatorsOp_Vector):
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_0_f_add_scale(
+        TestFusedOperatorsOp_broadcast_0):
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y.reshape(2, 1, 1) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_1_f_add_scale(
+        TestFusedOperatorsOp_broadcast_1):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y.reshape(1, 3, 1) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_2_f_add_scale(
+        TestFusedOperatorsOp_broadcast_2):
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y.reshape(1, 1, 4) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_3_f_add_scale(
+        TestFusedOperatorsOp_broadcast_3):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y.reshape(1, 3, 4, 1) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_4_f_add_scale(
+        TestFusedOperatorsOp_broadcast_4):
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.scale = 0.2
+        self.out = self.x + self.y.reshape(2, 1, 1, 1) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_rowwise_add_0_f_add_scale(
+        TestFusedOperatorsOp_rowwise_add_0):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.1
+        self.out = self.x + self.y.reshape(1, 3, 4) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_rowwise_add_1_f_add_scale(
+        TestFusedOperatorsOp_rowwise_add_1):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.scale = 0.2
+        self.out = self.x + self.y.reshape(1, 1) * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+class TestFusedOperatorsOp_channelwise_add_f_add_scale(
+        TestFusedOperatorsOp_channelwise_add):
+    def init_axis(self):
+        self.axis = -1
+
+    def init_output(self):
+        self.scale = 0.2
+        self.out = self.x + self.y * self.scale
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'scale': self.scale,
+            'functor_list': ["elementwise_add", "scale"]
+        }
+
+
+# add + relu
+#   TestElementwiseAddOp_f_add_relu
+#   TestFusedOperatorsOp_scalar_f_add_relu
+#   TestFusedOperatorsOp_scalar2_f_add_relu
+#   TestFusedOperatorsOp_Vector_f_add_relu
+#   TestFusedOperatorsOp_broadcast_0_f_add_relu
+#   TestFusedOperatorsOp_broadcast_1_f_add_relu
+#   TestFusedOperatorsOp_broadcast_2_f_add_relu
+#   TestFusedOperatorsOp_broadcast_3_f_add_relu
+#   TestFusedOperatorsOp_broadcast_4_f_add_relu
+#   TestFusedOperatorsOp_rowwise_add_0_f_add_relu
+#   TestFusedOperatorsOp_rowwise_add_1_f_add_relu
+#   TestFusedOperatorsOp_channelwise_add_f_add_relu
+
+
+class TestFusedOperatorsOp_f_add_relu(TestElementwiseAddOp):
+    def init_output(self):
+        # Copy from test_activation_op.py
+        # Because we set delta = 0.005 in calculating numeric gradient,
+        # if x is too small, such as 0.002, x_neg will be -0.003
+        # x_pos will be 0.007, so the numeric gradient is inaccurate.
+        # we should avoid this
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y, 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_scalar_f_add_relu(TestFusedOperatorsOp_scalar):
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y, 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_scalar2_f_add_relu(TestFusedOperatorsOp_scalar2):
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y, 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_Vector_f_add_relu(TestFusedOperatorsOp_Vector):
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y, 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_0_f_add_relu(
+        TestFusedOperatorsOp_broadcast_0):
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(2, 1, 1), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_1_f_add_relu(
+        TestFusedOperatorsOp_broadcast_1):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(1, 3, 1), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_2_f_add_relu(
+        TestFusedOperatorsOp_broadcast_2):
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(1, 1, 4), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_3_f_add_relu(
+        TestFusedOperatorsOp_broadcast_3):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(1, 3, 4, 1), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_4_f_add_relu(
+        TestFusedOperatorsOp_broadcast_4):
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(2, 1, 1, 1), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_rowwise_add_0_f_add_relu(
+        TestFusedOperatorsOp_rowwise_add_0):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(1, 3, 4), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_rowwise_add_1_f_add_relu(
+        TestFusedOperatorsOp_rowwise_add_1):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y.reshape(1, 1), 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+class TestFusedOperatorsOp_channelwise_add_f_add_relu(
+        TestFusedOperatorsOp_channelwise_add):
+    def init_axis(self):
+        self.axis = -1
+
+    def init_output(self):
+        self.y[np.abs(self.y) < 0.005] = 0.02
+        self.out = self.x + np.maximum(self.y, 0)
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["elementwise_add", "relu"]
+        }
+
+
+# relu + add
+#   TestElementwiseAddOp_f_relu_add
+#   TestFusedOperatorsOp_scalar_f_relu_add
+#   TestFusedOperatorsOp_scalar2_f_relu_add
+#   TestFusedOperatorsOp_Vector_f_relu_add
+#   TestFusedOperatorsOp_broadcast_0_f_relu_add
+#   TestFusedOperatorsOp_broadcast_1_f_relu_add
+#   TestFusedOperatorsOp_broadcast_2_f_relu_add
+#   TestFusedOperatorsOp_broadcast_3_f_relu_add
+#   TestFusedOperatorsOp_broadcast_4_f_relu_add
+#   TestFusedOperatorsOp_rowwise_add_0_f_relu_add
+#   TestFusedOperatorsOp_rowwise_add_1_f_relu_add
+#   TestFusedOperatorsOp_channelwise_add_f_relu_add
+
+
+class TestFusedOperatorsOp_f_relu_add(TestElementwiseAddOp):
+    def init_output(self):
+        # Copy from test_activation_op.py
+        # Because we set delta = 0.005 in calculating numeric gradient,
+        # if x is too small, such as 0.002, x_neg will be -0.003
+        # x_pos will be 0.007, so the numeric gradient is inaccurate.
+        # we should avoid this
+        self.out = self.x + self.y
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_scalar_f_relu_add(TestFusedOperatorsOp_scalar):
+    def init_output(self):
+        self.out = self.x + self.y
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_scalar2_f_relu_add(TestFusedOperatorsOp_scalar2):
+    def init_output(self):
+        self.out = self.x + self.y
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_Vector_f_relu_add(TestFusedOperatorsOp_Vector):
+    def init_output(self):
+        self.out = self.x + self.y
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_0_f_relu_add(
+        TestFusedOperatorsOp_broadcast_0):
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.out = self.x + self.y.reshape(2, 1, 1)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_1_f_relu_add(
+        TestFusedOperatorsOp_broadcast_1):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.out = self.x + self.y.reshape(1, 3, 1)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_2_f_relu_add(
+        TestFusedOperatorsOp_broadcast_2):
+    def init_output(self):
+        self.out = self.x + self.y.reshape(1, 1, 4)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_3_f_relu_add(
+        TestFusedOperatorsOp_broadcast_3):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.out = self.x + self.y.reshape(1, 3, 4, 1)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_broadcast_4_f_relu_add(
+        TestFusedOperatorsOp_broadcast_4):
+    def init_axis(self):
+        self.axis = 0
+
+    def init_output(self):
+        self.out = self.x + self.y.reshape(2, 1, 1, 1)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_rowwise_add_0_f_relu_add(
+        TestFusedOperatorsOp_rowwise_add_0):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.out = self.x + self.y.reshape(1, 3, 4)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_rowwise_add_1_f_relu_add(
+        TestFusedOperatorsOp_rowwise_add_1):
+    def init_axis(self):
+        self.axis = 1
+
+    def init_output(self):
+        self.out = self.x + self.y.reshape(1, 1)
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+class TestFusedOperatorsOp_channelwise_add_f_relu_add(
+        TestFusedOperatorsOp_channelwise_add):
+    def init_axis(self):
+        self.axis = -1
+
+    def init_output(self):
+        self.out = self.x + self.y
+        self.out = np.maximum(self.out, 0)
+        self.out[np.abs(self.out) < 0.005] = 0.02
+
+    def init_attr(self):
+        self.attrs = {
+            'axis': self.axis,
+            'functor_list': ["relu", "elementwise_add"]
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py
index 8fbf156085..86a2c674d0 100644
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
@@ -38,7 +38,7 @@ class TestGRUOp(OpTest):
         for i in range(len(seq_lens)):
             seq_starts.append(seq_starts[-1] + seq_lens[i])
         sorted_seqs = sorted(
-            range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x])
+            list(range(len(seq_lens))), lambda x, y: seq_lens[y] - seq_lens[x])
         num_batch = seq_lens[sorted_seqs[0]]
         for batch_idx in range(num_batch):
             idx_in_seq = []
@@ -74,15 +74,16 @@ class TestGRUOp(OpTest):
     def gru(self):
         input, lod = self.inputs['Input']
         w = self.inputs['Weight']
-        b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros(
+        b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros(
             (1, self.frame_size * 3))
         batch_gate = self.outputs['BatchGate']
         batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev']
         batch_hidden = self.outputs['BatchHidden']
         hidden = self.outputs['Hidden']
         idx_in_seq_list = self.idx_in_seq_list
-        h_p = self.inputs['H0'][self.sorted_seqs] if self.inputs.has_key(
-            'H0') else np.zeros((len(idx_in_seq_list[0]), self.frame_size))
+        h_p = self.inputs['H0'][
+            self.sorted_seqs] if 'H0' in self.inputs else np.zeros(
+                (len(idx_in_seq_list[0]), self.frame_size))
         num_batch = len(idx_in_seq_list)
         end_idx = 0
         for batch_idx in range(num_batch):
diff --git a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
index c56b1eefd3..87a9eba4d9 100644
--- a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
@@ -76,7 +76,7 @@ class TestGRUUnitOp(OpTest):
         x = self.inputs['Input']
         h_p = self.inputs['HiddenPrev']
         w = self.inputs['Weight']
-        b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros(
+        b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros(
             (1, frame_size * 3))
         g = x + np.tile(b, (batch_size, 1))
         w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
index d090960c84..daa5da8d95 100644
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
@@ -17,6 +17,8 @@ import numpy as np
 import math
 from op_test import OpTest
 
+np.random.seed(100)
+
 
 def find_latest_set(num):
     return 1 + int(math.floor(math.log(num, 2)))
diff --git a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
index 6ecfa9ea21..23b1ed957a 100644
--- a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
@@ -43,7 +43,7 @@ class TestLayer(unittest.TestCase):
             hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu')
             fluid.layers.batch_norm(input=hidden2)
 
-        print str(main_program)
+        print(str(main_program))
 
     def test_dropout_layer(self):
         main_program = Program()
@@ -53,7 +53,7 @@ class TestLayer(unittest.TestCase):
                 name='pixel', shape=[3, 48, 48], dtype='float32')
             fluid.layers.dropout(x=images, dropout_prob=0.5)
 
-        print str(main_program)
+        print(str(main_program))
 
     def test_img_conv_group(self):
         main_program = Program()
@@ -65,7 +65,7 @@ class TestLayer(unittest.TestCase):
             conv1 = conv_block(images, 64, 2, [0.3, 0])
             conv_block(conv1, 256, 3, [0.4, 0.4, 0])
 
-        print str(main_program)
+        print(str(main_program))
 
     def test_elementwise_add_with_act(self):
         main_program = Program()
diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
index 51460cbb13..4cd203155f 100644
--- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py
+++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
@@ -48,7 +48,7 @@ class TestBook(unittest.TestCase):
 
         exe.run(init_program, feed={}, fetch_list=[])
 
-        for i in xrange(100):
+        for i in range(100):
             tensor_x = np.array(
                 [[1, 1], [1, 2], [3, 4], [5, 2]]).astype("float32")
             tensor_y = np.array([[-2], [-3], [-7], [-7]]).astype("float32")
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index 15a72cb605..b215e37986 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -27,12 +27,13 @@ class TestConstantInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.ConstantInitializer())
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.ConstantInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'fill_constant')
@@ -43,12 +44,13 @@ class TestConstantInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.ConstantInitializer(2.3))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.ConstantInitializer(2.3))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'fill_constant')
@@ -61,12 +63,13 @@ class TestUniformInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.UniformInitializer())
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.UniformInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -80,18 +83,19 @@ class TestUniformInitializer(unittest.TestCase):
         program = framework.Program()
         program.random_seed = 123
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.UniformInitializer())
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.UniformInitializer(seed=456))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param1",
+                initializer=initializer.UniformInitializer())
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param2",
+                initializer=initializer.UniformInitializer(seed=456))
         init_op = block.ops[1]
         self.assertEqual(init_op.attr("seed"), 123)
         init_op1 = block.ops[0]
@@ -102,12 +106,13 @@ class TestUniformInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.UniformInitializer(-4.2, 3.1, 123))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.UniformInitializer(-4.2, 3.1, 123))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -115,6 +120,25 @@ class TestUniformInitializer(unittest.TestCase):
         self.assertAlmostEqual(init_op.attr('max'), 3.1, delta=DELTA)
         self.assertEqual(init_op.attr('seed'), 123)
 
+    def test_uniform_initializer_two_op(self):
+        """Test uniform initializer with supplied attributes
+        """
+        program = framework.Program()
+        block = program.global_block()
+        for i in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.UniformInitializer(-4.2, float(i), 123))
+        self.assertEqual(len(block.ops), 1)
+        init_op0 = block.ops[0]
+        self.assertEqual(init_op0.type, 'uniform_random')
+        self.assertAlmostEqual(init_op0.attr('min'), -4.2, delta=DELTA)
+        self.assertAlmostEqual(init_op0.attr('max'), 0.0, delta=DELTA)
+        self.assertEqual(init_op0.attr('seed'), 123)
+
 
 class TestNormalInitializer(unittest.TestCase):
     def test_normal_initializer_default_value(self):
@@ -122,12 +146,13 @@ class TestNormalInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.NormalInitializer())
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.NormalInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -140,12 +165,13 @@ class TestNormalInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.NormalInitializer(2.3, 1.9, 123))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.NormalInitializer(2.3, 1.9, 123))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -161,12 +187,13 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer())
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.XavierInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -181,12 +208,13 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10, 15, 20],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer())
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10, 15, 20],
+                lod_level=0,
+                name="param",
+                initializer=initializer.XavierInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -203,12 +231,13 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer(uniform=False))
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.XavierInitializer(uniform=False))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -223,12 +252,13 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10, 15, 20],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer(uniform=False))
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10, 15, 20],
+                lod_level=0,
+                name="param",
+                initializer=initializer.XavierInitializer(uniform=False))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -244,13 +274,14 @@ class TestXavierInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.XavierInitializer(
-                fan_in=12, fan_out=23, seed=134))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.XavierInitializer(
+                    fan_in=12, fan_out=23, seed=134))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -267,12 +298,13 @@ class TestMSRAInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.MSRAInitializer())
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.MSRAInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -287,12 +319,13 @@ class TestMSRAInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10, 15, 20],
-            lod_level=0,
-            name="param",
-            initializer=initializer.MSRAInitializer())
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10, 15, 20],
+                lod_level=0,
+                name="param",
+                initializer=initializer.MSRAInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -308,12 +341,13 @@ class TestMSRAInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.MSRAInitializer(uniform=False))
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.MSRAInitializer(uniform=False))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -328,12 +362,13 @@ class TestMSRAInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        param = block.create_parameter(
-            dtype="float32",
-            shape=[5, 10, 15, 20],
-            lod_level=0,
-            name="param",
-            initializer=initializer.MSRAInitializer(uniform=False))
+        for _ in range(2):
+            param = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10, 15, 20],
+                lod_level=0,
+                name="param",
+                initializer=initializer.MSRAInitializer(uniform=False))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'gaussian_random')
@@ -348,13 +383,14 @@ class TestMSRAInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[5, 10],
-            lod_level=0,
-            name="param",
-            initializer=initializer.MSRAInitializer(
-                fan_in=12, seed=134))
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="param",
+                initializer=initializer.MSRAInitializer(
+                    fan_in=12, seed=134))
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'uniform_random')
@@ -370,12 +406,13 @@ class TestMSRAInitializer(unittest.TestCase):
         """
         program = framework.Program()
         block = program.global_block()
-        block.create_parameter(
-            dtype="float32",
-            shape=[8, 1, 3, 3],
-            lod_level=0,
-            name="param",
-            initializer=initializer.BilinearInitializer())
+        for _ in range(2):
+            block.create_parameter(
+                dtype="float32",
+                shape=[8, 1, 3, 3],
+                lod_level=0,
+                name="param",
+                initializer=initializer.BilinearInitializer())
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'assign_value')
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
index 69365db4d1..295887ccd1 100644
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
@@ -17,6 +17,7 @@ import numpy as np
 from operator import mul
 import paddle.fluid.core as core
 import paddle.fluid as fluid
+from functools import reduce
 
 np.random.random(123)
 
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index ab2ab24f35..8f2dac786d 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -279,7 +279,7 @@ class TestBook(unittest.TestCase):
     def test_nce(self):
         window_size = 5
         words = []
-        for i in xrange(window_size):
+        for i in range(window_size):
             words.append(
                 layers.data(
                     name='word_{0}'.format(i), shape=[1], dtype='int64'))
@@ -288,7 +288,7 @@ class TestBook(unittest.TestCase):
         label_word = int(window_size / 2) + 1
 
         embs = []
-        for i in xrange(window_size):
+        for i in range(window_size):
             if i == label_word:
                 continue
 
@@ -465,6 +465,15 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_shape(self):
+        program = Program()
+        with program_guard(program):
+            input = layers.data(
+                name="input", shape=[3, 100, 100], dtype="float32")
+            out = layers.shape(input, name="shape")
+            self.assertIsNotNone(out)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
index 16e85830ff..d53ead381d 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
@@ -36,7 +36,7 @@ class TestLoDRankTable(unittest.TestCase):
         exe.run(scope=scope, feed={'x': tensor})
         var = scope.find_var(rank_table.name)
         table = var.get_lod_rank_table()
-        self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())
+        self.assertEqual([(0, 5), (1, 1), (2, 1)], list(table.items()))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
index 118c22fbb1..0ac6d9b81d 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
@@ -24,7 +24,7 @@ class TestLoDTensorArray(unittest.TestCase):
         tensor_array = arr.get_lod_tensor_array()
         self.assertEqual(0, len(tensor_array))
         cpu = core.CPUPlace()
-        for i in xrange(10):
+        for i in range(10):
             t = core.LoDTensor()
             t.set(numpy.array([i], dtype='float32'), cpu)
             t.set_recursive_sequence_lengths([[1]])
@@ -32,7 +32,7 @@ class TestLoDTensorArray(unittest.TestCase):
 
         self.assertEqual(10, len(tensor_array))
 
-        for i in xrange(10):
+        for i in range(10):
             t = tensor_array[i]
             self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32'))
             self.assertEqual([[1]], t.recursive_sequence_lengths())
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
index 5a4580116b..9789ff4af6 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
@@ -35,8 +35,10 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor.set(
             numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
         tensor.set_recursive_sequence_lengths([[3, 6, 1]])
-        expect = map(lambda x: numpy.array(x).astype('int32'),
-                     [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
+        expect = [
+            numpy.array(x).astype('int32')
+            for x in [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]
+        ]
         self.main(
             tensor=tensor,
             expect_array=expect,
@@ -48,8 +50,10 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor.set(
             numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
         tensor.set_recursive_sequence_lengths([[3, 6, 0, 1]])
-        expect = map(lambda x: numpy.array(x).astype('int32'),
-                     [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
+        expect = [
+            numpy.array(x).astype('int32')
+            for x in [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]
+        ]
         self.main(
             tensor=tensor,
             expect_array=expect,
@@ -111,8 +115,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         expect = [
             numpy.array(
                 item, dtype='int32')
-            for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], range(
-                22, 39) + range(7, 21), range(39, 46)]
+            for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], list(
+                range(22, 39)) + list(range(7, 21)), list(range(39, 46))]
         ]
         lod = [[[1, 2, 1], [1, 3, 4, 4]], [[4, 3], [1, 4, 4, 8, 4, 6, 4]],
                [[2], [6, 1]]]
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
index f8d5785fbf..ac25f432df 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
@@ -35,77 +35,59 @@ class TestLookupTableOp(OpTest):
         self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
 
 
+class TestLookupTableOpWithTensorIds(OpTest):
+    def setUp(self):
+        self.op_type = "lookup_table"
+        table = np.random.random((17, 31)).astype("float32")
+        ids = np.random.randint(
+            low=0, high=17, size=(2, 4, 5, 1)).astype("int64")
+        self.inputs = {'W': table, 'Ids': ids}
+        self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
+
+
 class TestLookupTableOpWithPadding(TestLookupTableOp):
     def test_check_output(self):
         ids = np.squeeze(self.inputs['Ids'])
         padding_idx = np.random.choice(ids, 1)[0]
         self.outputs['Out'][ids == padding_idx] = np.zeros(31)
-        self.attrs = {'padding_idx': long(padding_idx)}
+        self.attrs = {'padding_idx': int(padding_idx)}
         self.check_output()
 
     def test_check_grad(self):
-        # Since paddings are not trainable and fixed in forward, the gradient of 
+        # Since paddings are not trainable and fixed in forward, the gradient of
         # paddings makes no sense and we don't test the gradient here.
         pass
 
 
-class TestLookupTableIdsIsSelectedRows(OpTest):
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        # create and initialize Variable
-        height = 10
-        rows = [0, 4, 4, 7]
-        row_numel = 12
-
-        # create and initialize W Variable
-        W = scope.var('W').get_tensor()
-        W_array = np.full((height, row_numel), 1.0).astype("float32")
-        for i in range(height):
-            W_array[i] *= i
-        W.set(W_array, place)
-
-        # create and initialize Ids Variable
-        ids_selected_rows = scope.var('Ids').get_selected_rows()
-        ids_selected_rows.set_height(len(rows))
-        ids_selected_rows.set_rows(rows)
-        np_array = np.ones((len(rows), row_numel)).astype("float32")
-        ids_tensor = ids_selected_rows.get_tensor()
-        ids_tensor.set(np_array, place)
-
-        # create Out Variable
-        Out = scope.var('Out').get_selected_rows()
-
-        # create and run lookup_table operator
-        concat_rows_op = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
-        concat_rows_op.run(scope, place)
-
-        # get result from Out
-        Out_tensor = Out.get_tensor()
-        result_array = np.array(Out_tensor)
-
-        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
-        for idx, row in enumerate(rows):
-            assert (row == result_array[idx]).all()
+class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds):
+    def test_check_output(self):
+        ids = self.inputs['Ids']
+        flatten_idx = ids.flatten()
+        padding_idx = np.random.choice(flatten_idx, 1)[0]
+        self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31)
+        self.attrs = {'padding_idx': long(padding_idx)}
+        self.check_output()
 
-    def test_concat_rows(self):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(core.CUDAPlace(0))
-        for place in places:
-            self.check_with_place(place)
+    def test_check_grad(self):
+        # Since paddings are not trainable and fixed in forward, the gradient of
+        # paddings makes no sense and we don't test the gradient here.
+        pass
 
 
 class TestLookupTableWIsSelectedRows(OpTest):
-    def check_with_place(self, place):
-        scope = core.Scope()
-
-        # create and initialize Id Variable
+    def prepare_ids(self, scope, place):
         ids_tensor = scope.var('Ids').get_tensor()
         ids_array = np.array([[0], [4], [3], [5]]).astype("int64")
         ids_tensor.set(ids_array, place)
+        return ids_array
 
-        # create and initialize W Variable
+    def prepare_w(self, scope, place):
         rows = [0, 1, 2, 3, 4, 5, 6]
         row_numel = 12
 
@@ -118,8 +100,22 @@ class TestLookupTableWIsSelectedRows(OpTest):
         w_tensor = w_selected_rows.get_tensor()
         w_tensor.set(w_array, place)
 
-        # create Out Variable
-        out_tensor = scope.var('Out').get_tensor()
+    def create_out_tensor(self, scope, place):
+        return scope.var('Out').get_tensor()
+
+    def check_result(self, ids_array, result_array):
+        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
+        for idx, row in enumerate(ids_array):
+            assert (row[0] == result_array[idx]).all()
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        ids_array = self.prepare_ids(scope, place)
+
+        self.prepare_w(scope, place)
+
+        out_tensor = self.create_out_tensor(scope, place)
 
         # create and run lookup_table operator
         lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
@@ -127,9 +123,8 @@ class TestLookupTableWIsSelectedRows(OpTest):
 
         # get result from Out
         result_array = np.array(out_tensor)
-        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
-        for idx, row in enumerate(ids_array):
-            assert (row[0] == result_array[idx]).all()
+
+        self.check_result(ids_array, result_array)
 
     def test_w_is_selected_rows(self):
         places = [core.CPUPlace()]
@@ -138,5 +133,19 @@ class TestLookupTableWIsSelectedRows(OpTest):
             self.check_with_place(place)
 
 
+class TestLookupTableWithTensorIdsWIsSelectedRows(
+        TestLookupTableWIsSelectedRows):
+    def prepare_ids(self, scope, place):
+        ids_tensor = scope.var('Ids').get_tensor()
+        ids_array = np.random.randint(
+            low=0, high=6, size=(2, 4, 3, 1)).astype("int64")
+        ids_tensor.set(ids_array, place)
+        return ids_array
+
+    def check_result(self, ids_array, result_array):
+        for idx, row in np.ndenumerate(ids_array):
+            assert (row == result_array[idx]).all()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py
index 64d42b693b..32b4ee1847 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_iou.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
@@ -80,7 +80,7 @@ class TestMeanIOUOp(OpTest):
             'InCorrects': in_corrects,
             'InMeanIou': in_mean_ious
         }
-        self.attrs = {'num_classes': long(self.num_classes)}
+        self.attrs = {'num_classes': int(self.num_classes)}
         mean_iou, out_wrong, out_correct = compute_mean_iou(
             predictions, labels, self.num_classes, in_wrongs, in_corrects,
             in_mean_ious)
diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
index cfd6e63e12..67733807f8 100644
--- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
@@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase):
         print(str(result_program))
 
 
+class TestMemoryTranspiler2(unittest.TestCase):
+    def setUp(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            x = layers.data(name='x', shape=[13], dtype='float32')
+            fc = layers.fc(input=x, size=10, act=None)
+            reshape = layers.reshape(x=fc, shape=[-1, 2, 5])
+            fc = layers.reshape(x=reshape, shape=[-1, 5, 2])
+            y_predict = layers.fc(input=fc, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(cost)
+            opt = optimizer.SGD(learning_rate=0.001)
+            opt.minimize(avg_cost)
+        self.program = program
+
+    def test_inplace_ops(self):
+        print("before optimization")
+        print(str(self.program))
+        result_program = memory_optimize(self.program)
+        print("after optimization")
+        print(str(result_program))
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_memory_usage.py b/python/paddle/fluid/tests/unittests/test_memory_usage.py
new file mode 100644
index 0000000000..f9daf83652
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_memory_usage.py
@@ -0,0 +1,69 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+import contextlib
+import unittest
+
+
+def train_simulator(test_batch_size=10):
+    if test_batch_size <= 0:
+        raise ValueError("batch_size should be a positive integeral value, "
+                         "but got batch_size={}".format(test_batch_size))
+
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    # Calculate memory usage in current network config 
+    lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
+        fluid.default_main_program(), batch_size=test_batch_size)
+
+    print("memory usage is about %.3f - %.3f %s" %
+          (lower_usage, upper_usage, unit))
+
+
+class TestMemoryUsage(unittest.TestCase):
+    def test_with_unit_B(self):
+        with self.program_scope_guard():
+            train_simulator()
+
+    def test_with_unit_KB(self):
+        with self.program_scope_guard():
+            train_simulator(test_batch_size=1000)
+
+    def test_with_unit_MB(self):
+        with self.program_scope_guard():
+            train_simulator(test_batch_size=100000)
+
+    @contextlib.contextmanager
+    def program_scope_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index aacd8ae45a..10cb78a08d 100644
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -112,7 +112,7 @@ def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
 
     if keep_top_k > -1 and num_det > keep_top_k:
         score_index = []
-        for c, indices in selected_indices.iteritems():
+        for c, indices in selected_indices.items():
             for idx in indices:
                 score_index.append((scores[c][idx], c, idx))
 
@@ -143,7 +143,7 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
         lod.append(nmsed_num)
         if nmsed_num == 0: continue
 
-        for c, indices in nmsed_outs.iteritems():
+        for c, indices in nmsed_outs.items():
             for idx in indices:
                 xmin, ymin, xmax, ymax = boxes[n][idx][:]
                 det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py
index 76ecc8ba08..7431a142c5 100644
--- a/python/paddle/fluid/tests/unittests/test_nce.py
+++ b/python/paddle/fluid/tests/unittests/test_nce.py
@@ -66,7 +66,7 @@ class TestNCE(OpTest):
         self.attrs = {
             'num_total_classes': num_classes,
             'num_neg_samples': num_neg_samples,
-            'custom_neg_classes': range(num_neg_samples)
+            'custom_neg_classes': list(range(num_neg_samples))
         }
         self.inputs = {
             'Input': input,
diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
index d13f2b3afd..06fccd39ac 100644
--- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
@@ -28,13 +28,13 @@ class TestOneHotOp(OpTest):
         depth = 10
         dimension = 12
         x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
         x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1])
 
         out = np.zeros(shape=(np.product(x.shape[:-1]),
                               depth)).astype('float32')
 
-        for i in xrange(np.product(x.shape)):
+        for i in range(np.product(x.shape)):
             out[i, x[i]] = 1.0
 
         self.inputs = {'X': (x, x_lod)}
@@ -51,13 +51,13 @@ class TestOneHotOp_default_dtype(OpTest):
         depth = 10
         dimension = 12
         x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))]
+        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
         x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1])
 
         out = np.zeros(shape=(np.product(x.shape[:-1]),
                               depth)).astype('float32')
 
-        for i in xrange(np.product(x.shape)):
+        for i in range(np.product(x.shape)):
             out[i, x[i]] = 1.0
 
         self.inputs = {'X': (x, x_lod)}
@@ -76,7 +76,7 @@ class TestOneHotOp_exception(OpTest):
         self.dimension = 12
         self.x = core.LoDTensor()
         x_lod = [[4, 1, 3, 3]]
-        data = [np.random.randint(11, 20) for i in xrange(sum(x_lod[0]))]
+        data = [np.random.randint(11, 20) for i in range(sum(x_lod[0]))]
         data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1])
         self.x.set(data, self.place)
         self.x.set_recursive_sequence_lengths(x_lod)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
index 63fb58c692..d17e493c36 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
@@ -167,10 +167,10 @@ class TestCRFModel(unittest.TestCase):
                 place=fluid.CPUPlace())
 
             data = train_data()
-            for i in xrange(10):
+            for i in range(10):
                 cur_batch = next(data)
-                print pe.run(feed=feeder.feed(cur_batch),
-                             fetch_list=[avg_cost.name])[0]
+                print(pe.run(feed=feeder.feed(cur_batch),
+                             fetch_list=[avg_cost.name])[0])
 
     @unittest.skip(reason="CI hangs")
     def test_update_sparse_parameter_all_reduce(self):
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
index 60d63364d5..a43f2e7c49 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -71,7 +71,7 @@ class TestFetchOp(unittest.TestCase):
 
             fetch_list = []
             all_vars = main.global_block().vars
-            for k, v in all_vars.iteritems():
+            for k, v in all_vars.items():
                 if 'tmp' not in k and k[0] is not '_' or v.persistable:
                     fetch_list.append(k)
 
@@ -90,7 +90,7 @@ class TestFetchOp(unittest.TestCase):
         iters = 3
         train_inputs = []
         for i in range(iters):
-            train_inputs.append(tst_reader_iter.next())
+            train_inputs.append(next(tst_reader_iter))
 
         os.environ['CPU_NUM'] = str(4)
         if core.is_compiled_with_cuda():
@@ -133,7 +133,7 @@ class TestFeedParallel(unittest.TestCase):
 
         for batch_id, data in enumerate(reader()):
             loss_np = pe.run(feed=data, fetch_list=[loss.name])[0]
-            print batch_id, loss_np
+            print(batch_id, loss_np)
             if batch_id == 2:
                 break
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index 76389d916f..9448d89cd5 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -37,7 +37,7 @@ def simple_fc_net(use_feed):
         reader = fluid.layers.io.double_buffer(reader)
         img, label = fluid.layers.read_file(reader)
     hidden = img
-    for _ in xrange(4):
+    for _ in range(4):
         hidden = fluid.layers.fc(
             hidden,
             size=200,
@@ -64,7 +64,7 @@ def fc_with_batchnorm(use_feed):
         img, label = fluid.layers.read_file(reader)
 
     hidden = img
-    for _ in xrange(1):
+    for _ in range(1):
         hidden = fluid.layers.fc(
             hidden,
             size=200,
@@ -98,16 +98,13 @@ class TestMNIST(TestParallelExecutorBase):
             fluid.recordio_writer.convert_reader_to_recordio_file(
                 MNIST_RECORDIO_FILE, reader, feeder)
 
-    def _init_data(self, random=True):
+    def _init_data(self):
         np.random.seed(5)
-        if random:
-            img = np.random.random(size=[32, 784]).astype(np.float32)
-        else:
-            img = np.ones(shape=[32, 784], dtype='float32')
+        img = np.random.random(size=[32, 784]).astype(np.float32)
         label = np.ones(shape=[32, 1], dtype='int64')
         return img, label
 
-    def _compare_reduce_and_allreduce(self, model, use_cuda, random_data=True):
+    def _compare_reduce_and_allreduce(self, model, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
         self.check_network_convergence(
@@ -115,7 +112,7 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_network_convergence(
             model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True)
 
-        img, label = self._init_data(random_data)
+        img, label = self._init_data()
 
         all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
             model,
@@ -131,9 +128,9 @@ class TestMNIST(TestParallelExecutorBase):
             use_reduce=True)
 
         for loss in zip(all_reduce_first_loss, reduce_first_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
         for loss in zip(all_reduce_last_loss, reduce_last_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-4)
+            self.assertAlmostEqual(loss[0], loss[1], delta=1e-4)
 
     # simple_fc
     def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
@@ -166,27 +163,27 @@ class TestMNIST(TestParallelExecutorBase):
         if use_cuda and not core.is_compiled_with_cuda():
             return
 
-        img, label = self._init_data(random=False)
+        img, label = self._init_data()
 
         single_first_loss, single_last_loss = self.check_network_convergence(
             method=simple_fc_net,
-            seed=1000,
+            seed=1,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
             use_parallel_executor=False)
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
             method=simple_fc_net,
-            seed=1000,
+            seed=1,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
             use_parallel_executor=True)
 
-        for p_f in parallel_first_loss:
-            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
-        for p_l in parallel_last_loss:
-            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
+        self.assertAlmostEquals(
+            np.mean(parallel_first_loss), single_first_loss, delta=1e-6)
+        self.assertAlmostEquals(
+            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
 
     def test_simple_fc_parallel_accuracy(self):
         self.check_simple_fc_parallel_accuracy(True)
@@ -211,7 +208,8 @@ class TestMNIST(TestParallelExecutorBase):
         self.check_batchnorm_fc_convergence(False)
 
     def test_batchnorm_fc_with_new_strategy(self):
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
+        # FIXME(zcd): close this test temporally.
+        # self._compare_reduce_and_allreduce(fc_with_batchnorm, True)
         self._compare_reduce_and_allreduce(fc_with_batchnorm, False)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
index 834e920845..a28428d8de 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -21,6 +21,19 @@ from parallel_executor_test_base import TestParallelExecutorBase
 import unittest
 import math
 import os
+import numpy as np
+
+# FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
+# and Executor is different. Because, for ParallelExecutor, the dropout_op of
+# the neural net will be copied N copies(N is the number of device). This will
+# lead to the random numbers generated by ParallelExecutor and Executor are different.
+# So, if we compare the loss of ParallelExecutor and Executor, we should remove the
+# dropout_op.
+remove_dropout = False
+
+# FIXME(zcd): If the neural net has batch_norm, the output of ParallelExecutor
+# and Executor is different.
+remove_bn = False
 
 
 def squeeze_excitation(input, num_channels, reduction_ratio):
@@ -53,7 +66,8 @@ def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
         groups=groups,
         act=None,
         bias_attr=False)
-    return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
+    return conv if remove_bn else fluid.layers.batch_norm(
+        input=conv, act=act, momentum=0.1)
 
 
 def shortcut(input, ch_out, stride):
@@ -92,13 +106,14 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
     return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
 
 
-def SE_ResNeXt50Small(batch_size=2, use_feed=False):
-    assert not use_feed, "SE_ResNeXt doesn't support feed yet"
+batch_size = 12
+img_shape = [3, 224, 224]
+
 
-    img = fluid.layers.fill_constant(
-        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
-    label = fluid.layers.fill_constant(
-        shape=[batch_size, 1], dtype='int64', value=0.0)
+def SE_ResNeXt50Small(use_feed):
+
+    img = fluid.layers.data(name='image', shape=img_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
     conv = conv_bn_layer(
         input=img, num_filters=16, filter_size=3, stride=2, act='relu')
@@ -127,7 +142,8 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):
     reshape = fluid.layers.reshape(
         x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
     pool = fluid.layers.reduce_mean(input=reshape, dim=2)
-    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
+    dropout = pool if remove_dropout else fluid.layers.dropout(
+        x=pool, dropout_prob=0.2, seed=1)
     # Classifier layer:
     prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
@@ -135,75 +151,135 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):
     return loss
 
 
-class TestResnet(TestParallelExecutorBase):
-    def check_resnet_convergence_with_learning_rate_decay(self,
-                                                          use_cuda=True,
-                                                          use_reduce=False,
-                                                          iter=20):
+def cosine_decay(learning_rate, step_each_epoch, epochs=120):
+    """
+    Applies cosine decay to the learning rate.
+    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
+    """
+    global_step = _decay_step_counter()
 
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
+    with init_on_cpu():
+        epoch = ops.floor(global_step / step_each_epoch)
+        decayed_lr = learning_rate * \
+                     (ops.cos(epoch * (math.pi / epochs)) + 1)/2
+    return decayed_lr
 
-        os.environ['CPU_NUM'] = str(4)
 
-        def _cosine_decay(learning_rate, step_each_epoch, epochs=120):
-            """
-            Applies cosine decay to the learning rate.
-            lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
-            """
-            global_step = _decay_step_counter()
+def optimizer(learning_rate=0.01):
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=cosine_decay(
+            learning_rate=learning_rate, step_each_epoch=2, epochs=1),
+        momentum=0.9,
+        regularization=fluid.regularizer.L2Decay(1e-4))
+    return optimizer
 
-            with init_on_cpu():
-                epoch = ops.floor(global_step / step_each_epoch)
-                decayed_lr = learning_rate * \
-                            (ops.cos(epoch * (math.pi / epochs)) + 1)/2
-            return decayed_lr
 
-        def _optimizer(learning_rate=0.01):
-            optimizer = fluid.optimizer.Momentum(
-                learning_rate=_cosine_decay(
-                    learning_rate=learning_rate, step_each_epoch=2, epochs=1),
-                momentum=0.9,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-            return optimizer
+class TestResnet(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        global remove_dropout
+        global remove_bn
+        remove_dropout = False
+        remove_bn = False
+
+    def _init_data(self, batch_size=2, random=True):
+        np.random.seed(5)
+        if random:
+            img = np.random.random(
+                size=[batch_size] + img_shape).astype(np.float32)
+        else:
+            img = np.ones(shape=[batch_size] + img_shape, dtype='float32')
+        label = [np.random.randint(0, 999) for _ in range(batch_size)]
+        label = np.array(label).astype(np.int64).reshape(-1, 1)
+        return img, label
+
+    def _compare_reduce_and_allreduce(self,
+                                      model,
+                                      use_cuda,
+                                      iter=20,
+                                      delta2=1e-6):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
 
-        import functools
+        global remove_bn
+        remove_bn = True
 
-        batch_size = 2
+        img, label = self._init_data(batch_size=batch_size)
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=False,
+            optimizer=optimizer)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=True,
+            optimizer=optimizer)
+
+        for loss in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
+    def _check_resnet_convergence(self,
+                                  model,
+                                  use_cuda=True,
+                                  use_reduce=False,
+                                  iter=20,
+                                  delta2=1e-6):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
 
+        global remove_dropout
+        global remove_bn
+        remove_dropout = True
+        remove_bn = True
+
+        img, label = self._init_data(batch_size=batch_size)
         single_first_loss, single_last_loss = self.check_network_convergence(
-            functools.partial(
-                SE_ResNeXt50Small, batch_size=batch_size),
+            model,
+            feed_dict={"image": img,
+                       "label": label},
             iter=iter,
             batch_size=batch_size,
             use_cuda=use_cuda,
             use_reduce=use_reduce,
-            optimizer=_optimizer,
+            optimizer=optimizer,
             use_parallel_executor=False)
-
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
-            functools.partial(
-                SE_ResNeXt50Small, batch_size=batch_size),
+            model,
+            feed_dict={"image": img,
+                       "label": label},
             iter=iter,
             batch_size=batch_size,
             use_cuda=use_cuda,
             use_reduce=use_reduce,
-            optimizer=_optimizer)
+            optimizer=optimizer)
 
-        for p_f in parallel_first_loss:
-            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
-        for p_l in parallel_last_loss:
-            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
+        self.assertAlmostEquals(
+            np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6)
+        self.assertAlmostEquals(
+            np.mean(parallel_last_loss), single_last_loss[0], delta=delta2)
 
     def test_seresnext_with_learning_rate_decay(self):
-        self.check_resnet_convergence_with_learning_rate_decay(True, False)
-        self.check_resnet_convergence_with_learning_rate_decay(
-            False, False, iter=5)
-
-    def test_seresnext_with_new_strategy_with_learning_rate_decay(self):
-        self.check_resnet_convergence_with_learning_rate_decay(True, True)
-        self.check_resnet_convergence_with_learning_rate_decay(
-            False, True, iter=5)
+        self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True)
+        self._check_resnet_convergence(
+            model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
+
+    def test_seresnext_with_new_strategy(self):
+        self._compare_reduce_and_allreduce(
+            model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2)
+        self._compare_reduce_and_allreduce(
+            model=SE_ResNeXt50Small, use_cuda=False, iter=5)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index 7688b8495d..fcb5947ff0 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -25,7 +25,7 @@ def simple_fc_net():
     img = fluid.layers.data(name='image', shape=[784], dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
     hidden = img
-    for _ in xrange(4):
+    for _ in range(4):
         hidden = fluid.layers.fc(
             hidden,
             size=200,
@@ -71,7 +71,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
                 share_vars_from=train_exe,
                 build_strategy=build_strategy)
 
-            for i in xrange(5):
+            for i in range(5):
                 test_loss, = test_exe.run([loss.name], feed=feed_dict)
 
                 train_loss, = train_exe.run([loss.name], feed=feed_dict)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index b6215fddb1..8203d5d1fc 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -21,7 +21,7 @@ import paddle
 import paddle.dataset.wmt16 as wmt16
 import os
 
-WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"
+WMT16_RECORDIO_FILE = "/tmp/wmt16.recordio"
 
 
 class ModelHyperParams(object):
@@ -167,10 +167,9 @@ class TestTransformer(TestParallelExecutorBase):
                     writer.append_tensor(t)
                 writer.complete_append_tensor()
 
-    @unittest.skip("transformer is buggy in multi gpu")
     def test_main(self):
         self.check_network_convergence(transformer, use_cuda=True)
-        self.check_network_convergence(transformer, use_cuda=False)
+        self.check_network_convergence(transformer, use_cuda=False, iter=5)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py
index 18309f4577..c9617e3677 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
@@ -18,6 +18,7 @@ import paddle.fluid as fluid
 from paddle.fluid.layers.device import get_places
 import paddle.fluid.profiler as profiler
 import numpy
+import six
 
 
 class BaseParallelForTest(unittest.TestCase):
@@ -25,20 +26,20 @@ class BaseParallelForTest(unittest.TestCase):
         """
         Run the unittest for parallel.for
         Args:
-            callback(callable): A callable function returns a generator. There 
-                are two yields in the generator function. The first yield 
-                returns the data layers, and the second yield returns the loss. 
-                The modified data variables will be sent back during the first 
+            callback(callable): A callable function returns a generator. There
+                are two yields in the generator function. The first yield
+                returns the data layers, and the second yield returns the loss.
+                The modified data variables will be sent back during the first
                 yield.
 
             feed(dict): The executor feeding dictionary.
-            fetch(list|basestr): The fetch name lists. 
+            fetch(list|basestr): The fetch name lists.
 
         Returns:
             None
 
         Raises:
-            AssertionError when the computation of cpu, parallel.for in cpu, 
+            AssertionError when the computation of cpu, parallel.for in cpu,
                 gpu, parallel.for in gpu are different.
 
         """
@@ -95,14 +96,14 @@ class BaseParallelForTest(unittest.TestCase):
         """
         Run a single test, returns the fetch values
         Args:
-            place(Place): the computation place. 
-            use_parallel(bool): Whether use parallel.for or not. 
+            place(Place): the computation place.
+            use_parallel(bool): Whether use parallel.for or not.
 
         Returns:
             Fetched numpy arrays.
 
         """
-        if isinstance(fetch, basestring):
+        if isinstance(fetch, six.string_types):
             fetch = [fetch]
         main = fluid.Program()
         startup = fluid.Program()
@@ -124,7 +125,7 @@ class BaseParallelForTest(unittest.TestCase):
                     data = [data]
 
                 with pd.do():
-                    ins = map(pd.read_input, data)
+                    ins = list(map(pd.read_input, data))
                     if len(ins) == 1:
                         ins = ins[0]
                     loss = generator.send(ins)  # patch input
@@ -156,7 +157,7 @@ class BaseParallelForTest(unittest.TestCase):
 
         Returns:
             None
-            
+
         Raises:
             AssertionError
 
diff --git a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
index 2105d32066..8aff4e87f6 100644
--- a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
+++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
@@ -23,9 +23,9 @@ def PolygonBoxRestore(input):
     geo_channels = shape[1]
     h = shape[2]
     w = shape[3]
-    h_indexes = np.array(range(h) * w).reshape(
+    h_indexes = np.array(list(range(h)) * w).reshape(
         [w, h]).transpose()[np.newaxis, :]  # [1, h, w]
-    w_indexes = np.array(range(w) * h).reshape(
+    w_indexes = np.array(list(range(w)) * h).reshape(
         [h, w])[np.newaxis, :]  # [1, h, w]
     indexes = np.concatenate(
         (w_indexes, h_indexes))[np.newaxis, :]  # [1, 2, h, w]
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index f7e1e85732..1cf70311b4 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -35,8 +35,8 @@ def max_pool2D_forward_naive(x,
              ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
                                                    paddings[1]) / strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
-    for i in xrange(H_out):
-        for j in xrange(W_out):
+    for i in range(H_out):
+        for j in range(W_out):
             r_start = np.max((i * strides[0] - paddings[0], 0))
             r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
             c_start = np.max((j * strides[1] - paddings[1], 0))
@@ -63,8 +63,8 @@ def avg_pool2D_forward_naive(x,
              ) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
                                                    paddings[1]) / strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
-    for i in xrange(H_out):
-        for j in xrange(W_out):
+    for i in range(H_out):
+        for j in range(W_out):
             r_start = np.max((i * strides[0] - paddings[0], 0))
             r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
             c_start = np.max((j * strides[1] - paddings[1], 0))
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index 142165f29b..92c64b3792 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -38,13 +38,13 @@ def max_pool3D_forward_naive(x,
              ) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 *
                                                    paddings[2]) / strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
-    for k in xrange(D_out):
+    for k in range(D_out):
         d_start = np.max((k * strides[0] - paddings[0], 0))
         d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
-        for i in xrange(H_out):
+        for i in range(H_out):
             h_start = np.max((i * strides[0] - paddings[0], 0))
             h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
-            for j in xrange(W_out):
+            for j in range(W_out):
                 w_start = np.max((j * strides[1] - paddings[1], 0))
                 w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
@@ -72,13 +72,13 @@ def avg_pool3D_forward_naive(x,
              ) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 *
                                                    paddings[2]) / strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
-    for k in xrange(D_out):
+    for k in range(D_out):
         d_start = np.max((k * strides[0] - paddings[0], 0))
         d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
-        for i in xrange(H_out):
+        for i in range(H_out):
             h_start = np.max((i * strides[0] - paddings[0], 0))
             h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
-            for j in xrange(W_out):
+            for j in range(W_out):
                 w_start = np.max((j * strides[1] - paddings[1], 0))
                 w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py
index cf9b763922..e6a9f6f08c 100644
--- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py
@@ -29,21 +29,21 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):
     W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     mask = np.zeros((N, C, D_out, H_out, W_out))
-    for k in xrange(D_out):
+    for k in range(D_out):
         d_start = np.max((k * strides[0] - paddings[0], 0))
         d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
-        for i in xrange(H_out):
+        for i in range(H_out):
             h_start = np.max((i * strides[0] - paddings[0], 0))
             h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
-            for j in xrange(W_out):
+            for j in range(W_out):
                 w_start = np.max((j * strides[1] - paddings[1], 0))
                 w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
 
                 out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
 
-                for n in xrange(N):
-                    for c in xrange(C):
+                for n in range(N):
+                    for c in range(C):
                         arr = x_masked[n, c, :, :, :]
                         index = np.where(arr == np.max(arr))
                         sub_deep = index[0][0]
@@ -67,8 +67,8 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False):
     W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     mask = np.zeros((N, C, H_out, W_out))
-    for i in xrange(H_out):
-        for j in xrange(W_out):
+    for i in range(H_out):
+        for j in range(W_out):
             r_start = np.max((i * strides[0] - paddings[0], 0))
             r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
             c_start = np.max((j * strides[1] - paddings[1], 0))
@@ -77,8 +77,8 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False):
 
             out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
 
-            for n in xrange(N):
-                for c in xrange(C):
+            for n in range(N):
+                for c in range(C):
                     arr = x_masked[n, c, :, :]
                     index = np.where(arr == np.max(arr))
                     sub_row = index[0][0]
diff --git a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
index 091cfc9c72..8c76393bda 100644
--- a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
+++ b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
@@ -32,7 +32,7 @@ def py_pnpair_op(score, label, query, column=-1, weight=None):
 
     # accumulate statistics
     pos, neg, neu = 0, 0, 0
-    for _, ranks in predictions.items():
+    for _, ranks in list(predictions.items()):
         for e1, e2 in itertools.combinations(ranks, 2):
             s1, s2, l1, l2, w1, w2 = e1[0], e2[0], e1[1], e2[1], e1[2], e2[2]
             w = (w1 + w2) * 0.5
diff --git a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
index 7830ba2958..5ae425fee1 100644
--- a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
+++ b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
@@ -39,19 +39,19 @@ def get_states(idxs, labels, cls_num, weights=None):
     ins_num = idxs.shape[0]
     # TP FP TN FN
     states = np.zeros((cls_num, 4)).astype('float32')
-    for i in xrange(ins_num):
+    for i in range(ins_num):
         w = weights[i] if weights is not None else 1.0
         idx = idxs[i][0]
         label = labels[i][0]
         if idx == label:
             states[idx][0] += w
-            for j in xrange(cls_num):
+            for j in range(cls_num):
                 states[j][2] += w
             states[idx][2] -= w
         else:
             states[label][3] += w
             states[idx][1] += w
-            for j in xrange(cls_num):
+            for j in range(cls_num):
                 states[j][2] += w
             states[label][2] -= w
             states[idx][2] -= w
@@ -64,7 +64,7 @@ def compute_metrics(states, cls_num):
     total_fn_count = 0.0
     macro_avg_precision = 0.0
     macro_avg_recall = 0.0
-    for i in xrange(cls_num):
+    for i in range(cls_num):
         total_tp_count += states[i][0]
         total_fp_count += states[i][1]
         total_fn_count += states[i][3]
@@ -90,9 +90,9 @@ class TestPrecisionRecallOp_0(OpTest):
         ins_num = 64
         cls_num = 10
         max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
-        idxs = np.random.choice(xrange(cls_num), ins_num).reshape(
+        idxs = np.random.choice(range(cls_num), ins_num).reshape(
             (ins_num, 1)).astype('int32')
-        labels = np.random.choice(xrange(cls_num), ins_num).reshape(
+        labels = np.random.choice(range(cls_num), ins_num).reshape(
             (ins_num, 1)).astype('int32')
         states = get_states(idxs, labels, cls_num)
         metrics = compute_metrics(states, cls_num)
@@ -117,10 +117,10 @@ class TestPrecisionRecallOp_1(OpTest):
         ins_num = 64
         cls_num = 10
         max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
-        idxs = np.random.choice(xrange(cls_num), ins_num).reshape(
+        idxs = np.random.choice(range(cls_num), ins_num).reshape(
             (ins_num, 1)).astype('int32')
         weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
-        labels = np.random.choice(xrange(cls_num), ins_num).reshape(
+        labels = np.random.choice(range(cls_num), ins_num).reshape(
             (ins_num, 1)).astype('int32')
 
         states = get_states(idxs, labels, cls_num, weights)
@@ -151,10 +151,10 @@ class TestPrecisionRecallOp_2(OpTest):
         ins_num = 64
         cls_num = 10
         max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
-        idxs = np.random.choice(xrange(cls_num), ins_num).reshape(
+        idxs = np.random.choice(range(cls_num), ins_num).reshape(
             (ins_num, 1)).astype('int32')
         weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
-        labels = np.random.choice(xrange(cls_num), ins_num).reshape(
+        labels = np.random.choice(range(cls_num), ins_num).reshape(
             (ins_num, 1)).astype('int32')
         states = np.random.randint(0, 30, (cls_num, 4)).astype('float32')
 
diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
index f75a79bfa4..621dd68134 100644
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
@@ -183,7 +183,7 @@ class TestBlockDesc(unittest.TestCase):
         op2 = block.append_op()
         op0 = block._prepend_op()
         all_ops = []
-        for idx in xrange(0, block.op_size()):
+        for idx in range(0, block.op_size()):
             all_ops.append(block.op(idx))
         self.assertEqual(all_ops, [op0, op1, op2])
 
@@ -205,7 +205,7 @@ class TestBlockDesc(unittest.TestCase):
         program._sync_with_cpp()
 
         all_ops = []
-        for idx in xrange(0, block.op_size()):
+        for idx in range(0, block.op_size()):
             all_ops.append(block.op(idx))
         self.assertEqual(all_ops, [op0, op2])
 
diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py
index d35183647e..3ad85d5748 100644
--- a/python/paddle/fluid/tests/unittests/test_reader_reset.py
+++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py
@@ -21,7 +21,7 @@ import unittest
 class TestReaderReset(unittest.TestCase):
     def prepare_data(self):
         def fake_data_generator():
-            for n in xrange(self.total_ins_num):
+            for n in range(self.total_ins_num):
                 yield np.ones(self.ins_shape) * n, n
 
         # Prepare data
diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
index d6ff18430e..2e22df2beb 100644
--- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
@@ -203,12 +203,12 @@ class RecurrentOpTest1(unittest.TestCase):
                     num_grad[idx], ana_grad[idx], rtol=0.1).all())
 
     def check_forward(self):
-        print 'test recurrent op forward'
+        print('test recurrent op forward')
         pd_output = self.forward()
         py_output = self.py_rnn.forward()
-        print 'pd_output', pd_output
+        print('pd_output', pd_output)
         print
-        print 'py_output', py_output
+        print('py_output', py_output)
         self.assertEqual(pd_output.shape, py_output.shape)
         self.assertTrue(np.isclose(pd_output, py_output, rtol=0.1).all())
 
@@ -445,7 +445,7 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1):
         self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape,
                                                             self.output_shape)
         self.output = layers.mean(self.create_rnn_op(), **self.p_info)
-        print self.main_program
+        print(self.main_program)
 
     def create_rnn_op(self):
         x = layers.data(
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index f51b5a7e99..2f5558578a 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -25,7 +25,7 @@ class TestReshapeOp(OpTest):
 
         self.op_type = "reshape"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape, "inplace": False}
+        self.attrs = {"shape": new_shape}
         self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
@@ -42,7 +42,7 @@ class TestReshapeOpDimInfer1(OpTest):
 
         self.op_type = "reshape"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape, "inplace": False}
+        self.attrs = {"shape": new_shape}
         self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
 
     def test_check_output(self):
@@ -60,7 +60,7 @@ class TestReshapeOpDimInfer2(OpTest):
 
         self.op_type = "reshape"
         self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape, "inplace": False}
+        self.attrs = {"shape": new_shape}
         self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py
index 9701d9adef..1a6e1aad79 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
@@ -26,9 +26,9 @@ class TestSeqProject(OpTest):
         if self.context_length == 1 \
                 and self.context_start == 0 \
                 and self.padding_trainable:
-            print "If context_start is 0 " \
+            print("If context_start is 0 " \
                   "and context_length is 1," \
-                  " padding_trainable should be false."
+                  " padding_trainable should be false.")
             return
 
         # one level, batch size
@@ -212,7 +212,7 @@ class TestSeqProjectCase2(TestSeqProject):
         self.context_stride = 1
 
         self.input_size = [self.input_row, 23]
-        idx = range(self.input_size[0])
+        idx = list(range(self.input_size[0]))
         del idx[0]
         offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
                       [self.input_size[0]]]
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
index 0bbd31814e..5ff0dab23e 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
@@ -44,7 +44,7 @@ class TestSequenceExpand(OpTest):
             out_lod = [[]]
 
         offset = 0
-        for i in xrange(len(y_lod[ref_level])):
+        for i in range(len(y_lod[ref_level])):
             repeat_num = y_lod[ref_level][i]
             x_len = x_idx[i]
 
@@ -55,7 +55,7 @@ class TestSequenceExpand(OpTest):
                     stacked_x_sub = np.vstack((stacked_x_sub, x_sub))
                 out = np.vstack((out, stacked_x_sub))
                 if x_lod is not None:
-                    for j in xrange(repeat_num):
+                    for j in range(repeat_num):
                         out_lod[0].append(x_len)
             offset += x_len
 
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
index 68f2e5eba3..39b02ecf6d 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
@@ -35,7 +35,7 @@ class TestSequenceReshape(OpTest):
     def compute_output(self, x, x_lod, dimension):
         x_width = x.shape[1]
         out_lod = [[]]
-        for i in xrange(len(x_lod[0])):
+        for i in range(len(x_lod[0])):
             seq_len = x_lod[0][i]
             offset = (seq_len * x_width) / dimension
             assert int(offset) * dimension == seq_len * x_width
diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
index 6f0e337034..a994bf181a 100644
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
@@ -48,7 +48,7 @@ class TestShrinkRNNMemoryBase(unittest.TestCase):
 
     def sum_lodtensor(self, tensor):
         sum_res = 0.0
-        for i in xrange(np.product(tensor.shape())):
+        for i in range(np.product(tensor.shape())):
             sum_res += tensor._get_float_element(i)
         return sum_res
 
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 0ab581cfb0..70ad05597c 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -26,15 +26,22 @@ def stable_softmax(x):
 
 
 class TestSoftmaxOp(OpTest):
+    def get_x_shape(self):
+        return [10, 10]
+
     def setUp(self):
         self.op_type = "softmax"
         self.use_cudnn = False
         self.use_mkldnn = False
         self.dtype = np.float32
         self.init_kernel_type()
+        self.shape = self.get_x_shape()
+
+        x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
+        out = np.apply_along_axis(stable_softmax, 1,
+                                  x.reshape([-1, self.shape[-1]]))
+        out = out.reshape(self.shape)
 
-        x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype)
-        out = np.apply_along_axis(stable_softmax, 1, x)
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.outputs = {'Out': out}
         self.attrs = {
@@ -63,6 +70,11 @@ class TestSoftmaxOp(OpTest):
             self.check_grad(["X"], "Out", max_relative_error=0.01)
 
 
+class TestSoftmaxOp2(TestSoftmaxOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp(TestSoftmaxOp):
@@ -70,6 +82,13 @@ class TestSoftmaxCUDNNOp(TestSoftmaxOp):
         self.use_cudnn = True
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxFP16Op(TestSoftmaxOp):
@@ -83,6 +102,13 @@ class TestSoftmaxFP16Op(TestSoftmaxOp):
                 self.check_output_with_place(place, atol=1e-3)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxFP16Op2(TestSoftmaxFP16Op):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
 class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
@@ -97,10 +123,22 @@ class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
                 self.check_output_with_place(place, atol=1e-3)
 
 
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+
 class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
     def init_kernel_type(self):
         self.use_mkldnn = True
 
 
+class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp):
+    def get_x_shape(self):
+        return [2, 3, 4, 5]
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
index e9f0a06a56..ca78613098 100644
--- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
@@ -15,6 +15,8 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
 
 
 class TestSplitIdsOp(OpTest):
@@ -31,5 +33,55 @@ class TestSplitIdsOp(OpTest):
         self.check_output()
 
 
+class TestSpliteIds(unittest.TestCase):
+    def get_places(self):
+        places = [core.CPUPlace()]
+        return places
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        rows = [0, 5, 7, 4, 9]
+        height = 20
+        row_numel = 2
+
+        # initialize input variable X
+        x = scope.var('X').get_selected_rows()
+        x.set_rows(rows)
+        x.set_height(height)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        for i in range(len(rows)):
+            for j in range(row_numel):
+                np_array[i, j] = rows[i] + j
+        x_tensor = x.get_tensor()
+        x_tensor.set(np_array, place)
+
+        outs_name = ["out%d" % i for i in xrange(3)]
+        outs = [
+            scope.var(var_name).get_selected_rows() for var_name in outs_name
+        ]
+
+        # expected output selected rows
+        expected_out_rows = [[0, 9], [7, 4], [5]]
+
+        op = Operator("split_ids", Ids="X", Out=outs_name)
+
+        for _ in range(3):
+            op.run(scope, place)
+
+            for i in range(len(outs)):
+                expected_rows = expected_out_rows[i]
+                self.assertEqual(outs[i].rows(), expected_rows)
+                for j in range(len(expected_rows)):
+                    row = expected_rows[j]
+                    self.assertAlmostEqual(
+                        float(row), np.array(outs[i].get_tensor())[j, 0])
+                    self.assertAlmostEqual(
+                        float(row + 1), np.array(outs[i].get_tensor())[j, 1])
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py
index eb49a53e54..6b67a52e81 100644
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
@@ -26,7 +26,7 @@ class TestSplitOp(OpTest):
         self.inputs = {'X': x}
         self.attrs = {'axis': axis, 'sections': [2, 1, 2]}
         self.outputs = {'Out': [('out%d' % i, out[i]) \
-            for i in xrange(len(out))]}
+            for i in range(len(out))]}
 
     def _set_op_type(self):
         self.op_type = "split"
diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
index 61040a39ce..2b261820e0 100644
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
@@ -53,7 +53,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
         height_sections = [5, 5, 5, 5, 3]
 
         # initialize output variables [out0, out1]
-        outs_name = ["out%d" % i for i in xrange(len(height_sections))]
+        outs_name = ["out%d" % i for i in range(len(height_sections))]
         outs = [
             scope.var(var_name).get_selected_rows() for var_name in outs_name
         ]
diff --git a/python/paddle/fluid/tests/unittests/test_spp_op.py b/python/paddle/fluid/tests/unittests/test_spp_op.py
index f0ab5909df..3cbfc2a703 100644
--- a/python/paddle/fluid/tests/unittests/test_spp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_spp_op.py
@@ -26,7 +26,7 @@ class TestSppOp(OpTest):
         input = np.random.random(self.shape).astype("float32")
         nsize, csize, hsize, wsize = input.shape
         out_level_flatten = []
-        for i in xrange(self.pyramid_height):
+        for i in range(self.pyramid_height):
             bins = np.power(2, i)
             kernel_size = [0, 0]
             padding = [0, 0]
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py
index cc2fcc5ec0..cbc3da5503 100644
--- a/python/paddle/fluid/tests/unittests/test_top_k_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py
@@ -28,7 +28,7 @@ class TestTopkOp(OpTest):
         self.inputs = {'X': input}
         self.attrs = {'k': k}
 
-        for rowid in xrange(32):
+        for rowid in range(32):
             row = input[rowid]
             output[rowid] = np.sort(row)[-k:]
             indices[rowid] = row.argsort()[-k:]
@@ -52,7 +52,7 @@ class TestTopkOp3d(OpTest):
         self.inputs = {'X': input_flat_2d}
         self.attrs = {'k': k}
 
-        for rowid in xrange(64):
+        for rowid in range(64):
             row = input_flat_2d[rowid]
             output[rowid] = np.sort(row)[-k:]
             indices[rowid] = row.argsort()[-k:]
diff --git a/python/paddle/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py
index a97d6dfdda..ecce4cdde2 100644
--- a/python/paddle/fluid/tests/unittests/test_unpool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py
@@ -22,10 +22,10 @@ def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
     out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0]
     out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1]
     out = np.zeros((s0, s1, out_hsize, out_wsize))
-    for nidx in xrange(s0):
-        for cidx in xrange(s1):
-            for h in xrange(s2):
-                for w in xrange(s3):
+    for nidx in range(s0):
+        for cidx in range(s1):
+            for h in range(s2):
+                for w in range(s3):
                     index = indices[nidx, cidx, h, w]
                     hidx = (index - index % out_wsize) / out_wsize
                     widx = index % out_wsize
@@ -47,16 +47,16 @@ class TestUnpoolOp(OpTest):
                 self.strides[1] + 1
         input = np.zeros((nsize, csize, hsize_out, wsize_out))
         indices = np.zeros((nsize, csize, hsize_out, wsize_out))
-        for i in xrange(hsize_out):
-            for j in xrange(wsize_out):
+        for i in range(hsize_out):
+            for j in range(wsize_out):
                 r_start = np.max((i * self.strides[0] - self.paddings[0], 0))
                 r_end = np.min((i * self.strides[0] + self.ksize[0] - \
                         self.paddings[0], hsize))
                 c_start = np.max((j * self.strides[1] - self.paddings[1], 0))
                 c_end = np.min((j * self.strides[1] + self.ksize[1] - \
                         self.paddings[1], wsize))
-                for nidx in xrange(nsize):
-                    for cidx in xrange(csize):
+                for nidx in range(nsize):
+                    for cidx in range(csize):
                         x_masked = pre_input[nidx, cidx, r_start:r_end, \
                                 c_start:c_end]
                         input[nidx, cidx, i, j] = x_masked.max()
diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py
index fe8808bc04..790e6afe5f 100644
--- a/python/paddle/fluid/tests/unittests/test_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_op.py
@@ -66,7 +66,7 @@ class TestWhileOp(unittest.TestCase):
         exe = Executor(cpu)
         d = []
 
-        for i in xrange(3):
+        for i in range(3):
             d.append(numpy.random.random(size=[10]).astype('float32'))
 
         outs = exe.run(feed={'d0': d[0],
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
index 55c6e54906..c6e176ca31 100644
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -18,14 +18,6 @@ import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 
 
-def as_lodtensor(np_array, lod, place):
-    tensor = core.LoDTensor()
-    tensor.set(np_value, place)
-    if lod is not None:
-        tensor.set_recursive_sequence_lengths(lod)
-    return tensor
-
-
 def create_op(scope, op_type, inputs, outputs, attrs):
     kwargs = dict()
 
@@ -69,6 +61,11 @@ def create_op(scope, op_type, inputs, outputs, attrs):
 
 
 def set_input(scope, op, inputs, place):
+    def np_value_to_fluid_value(input):
+        if input.dtype == np.float16:
+            input = input.view(np.uint16)
+        return input
+
     def __set_input__(var_name, var):
         if isinstance(var, tuple) or isinstance(var, np.ndarray):
             tensor = scope.find_var(var_name).get_tensor()
@@ -76,7 +73,7 @@ def set_input(scope, op, inputs, place):
                 tensor.set_recursive_sequence_lengths(var[1])
                 var = var[0]
             tensor._set_dims(var.shape)
-            tensor.set(var, place)
+            tensor.set(np_value_to_fluid_value(var), place)
         elif isinstance(var, float):
             scope.find_var(var_name).set_float(var)
         elif isinstance(var, int):
@@ -104,6 +101,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
         if name not in np_list:
             assert var_proto.intermediate, "{} not found".format(name)
         else:
+            # inferece the dtype from numpy value.
             np_value = np_list[name]
             if isinstance(np_value, tuple):
                 dtype = np_value[0].dtype
@@ -116,6 +114,16 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
                 if is_input:
                     shape = list(np_value.shape)
                     lod_level = 0
+        # NOTE(dzhwinter): type hacking
+        # numpy float16 is binded to paddle::platform::float16
+        # in tensor_py.h via the help of uint16 datatype. Because
+        # the internal memory representation of float16 is
+        # actually uint16_t in paddle. So we use np.uint16 in numpy for
+        # raw memory, it can pass through the pybind. So in the testcase,
+        # we feed data use data.view(uint16), but the dtype is float16 in fact.
+        # The data.view(uint16) means do not cast the data type, but process data as the uint16
+        if dtype == np.uint16:
+            dtype = np.float16
         return block.create_var(
             dtype=dtype, shape=shape, lod_level=lod_level, name=name)
 
@@ -142,7 +150,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
 
 
 def append_loss_ops(block, output_names):
-    mean_inputs = map(block.var, output_names)
+    mean_inputs = list(map(block.var, output_names))
     # for item in mean_inputs:
     #     print(item)
     #     print("Item", item.dtype)
diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py
index c62792face..17ab875f6a 100644
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
@@ -118,8 +118,9 @@ def multi_head_attention(queries,
         # FIXME(guosheng): Decouple the program desc with batch_size.
         return layers.reshape(
             x=trans_x,
-            shape=map(int,
-                      [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]]))
+            shape=list(
+                map(int, [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]
+                          ])))
 
     def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
         """
@@ -403,7 +404,7 @@ def transformer(
         trg_pad_idx,
         pos_pad_idx, ):
     file_obj = fluid.layers.open_recordio_file(
-        filename='./wmt16.recordio',
+        filename='/tmp/wmt16.recordio',
         shapes=[
             [batch_size * max_length, 1],
             [batch_size * max_length, 1],
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index 6084e860ed..1c4565a83c 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -18,16 +18,15 @@ import errno
 import shutil
 import time
 
-import core
-
-import data_feeder
-import executor
-import framework
-import io
+from . import core
+from . import data_feeder
+from . import executor
+from . import framework
+from . import io
 # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
-import optimizer as opt_module
-import parallel_executor
-from transpiler import distribute_transpiler
+from . import optimizer as opt_module
+from . import parallel_executor
+from .transpiler import distribute_transpiler
 
 __all__ = [
     'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent',
@@ -658,11 +657,12 @@ def build_feed_var_list(program, feed_order):
         if not isinstance(feed_order, dict):
             raise TypeError(
                 "The 'feed_order' should be either None, list or dict.")
-        if not sorted(feed_order.values()) == range(len(feed_order)):
+        if not sorted(feed_order.values()) == list(range(len(feed_order))):
             raise ValueError(
                 "The values of 'feed_order' should be a permutation of [0, len(feed_order))"
             )
-        sorted_pair_list = sorted(feed_order.items(), key=lambda item: item[1])
+        sorted_pair_list = sorted(
+            list(feed_order.items()), key=lambda item: item[1])
         feed_var_list = [
             program.global_block().var(pair[0]) for pair in sorted_pair_list
         ]
@@ -1041,8 +1041,8 @@ def _save_pserver_vars_by_notify(executor, dirname, lookup_table,
         lookup_table(string): the lookup table name, when use distribute
             lookup table, we can get lookup table name by DistributeTranspiler.
             table_name
-        pserver_endpoints(list): the parameter server ip:port list.
-            when use distribute lookup table, we can get pserver_endpoints by
+        ps_endpoint_list(list): the parameter server ip:port list.
+            when use distribute lookup table, we can get ps_endpoint_list by
             distribute arguments.
     Return:
         None
@@ -1080,7 +1080,7 @@ def _save_trainer_args(dirname, trainer_id, trainer_args):
 
     cur_dir = _get_trainer_dir(dirname, trainer_id)
 
-    for name, value in trainer_args.iteritems():
+    for name, value in list(trainer_args.items()):
         args_file = os.path.join(cur_dir, name)
         with open(args_file, 'w') as f:
             f.write(str(value))
@@ -1218,10 +1218,10 @@ def _scroll_delete(dirname, max_num_checkpoints=3):
         serial_num = _get_dir_serial(serial)
         serial_map[serial_num] = serial
 
-    if len(serial_map.keys()) <= max_num_checkpoints:
+    if len(list(serial_map.keys())) <= max_num_checkpoints:
         return
 
-    serials = serial_map.keys()
+    serials = list(serial_map.keys())
     serials.sort(reverse=True)
     serials = serials[max_num_checkpoints:]
     for serial in serials:
diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py
index eae13b5039..a8622ad544 100644
--- a/python/paddle/fluid/transpiler/__init__.py
+++ b/python/paddle/fluid/transpiler/__init__.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig
-from inference_transpiler import InferenceTranspiler
-from memory_optimization_transpiler import memory_optimize, release_memory
-from ps_dispatcher import HashName, RoundRobin
+from .distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig
+from .inference_transpiler import InferenceTranspiler
+from .memory_optimization_transpiler import memory_optimize, release_memory
+from .ps_dispatcher import HashName, RoundRobin
 
 __all__ = [
     "DistributeTranspiler", "InferenceTranspiler", "memory_optimize",
diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py
index dc597c3384..1bfab1f219 100644
--- a/python/paddle/fluid/transpiler/details/__init__.py
+++ b/python/paddle/fluid/transpiler/details/__init__.py
@@ -12,5 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from program_utils import *
-from ufind import *
+from .program_utils import *
+from .ufind import *
diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py
index 2ca1d4716b..76d10777f5 100644
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -17,8 +17,8 @@ def delete_ops(block, ops):
     try:
         start = list(block.ops).index(ops[0])
         end = list(block.ops).index(ops[-1])
-        [block._remove_op(start) for _ in xrange(end - start + 1)]
-    except Exception, e:
+        [block._remove_op(start) for _ in range(end - start + 1)]
+    except Exception as e:
         raise e
     block.program._sync_with_cpp()
 
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 2dd9ec3e47..15675b4e9f 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -28,18 +28,17 @@ Steps to transpile pserver:
 5. add listen_and_serv op
 """
 
-from __future__ import print_function
-
 import math
 import random
 import numpy as np
 
-from ps_dispatcher import RoundRobin, HashName, PSDispatcher
+from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
 from ..framework import Program, default_main_program, \
                         default_startup_program, Block, \
                         Parameter, grad_var_name
-from details import *
+from .details import *
+from functools import reduce
 
 LOOKUP_TABLE_TYPE = "lookup_table"
 LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
@@ -102,7 +101,7 @@ def slice_variable(var_list, slice_count, min_block_size):
                 block_size += dim1 - remains
         # update split_count after aligning
         split_count = int(math.ceil(var_numel / float(block_size)))
-        for block_id in xrange(split_count):
+        for block_id in range(split_count):
             curr_block_size = min(block_size, var_numel - (
                 (block_id) * block_size))
             block = VarBlock(var.name, block_id, curr_block_size)
@@ -117,7 +116,7 @@ class DistributeTranspilerConfig(object):
         try to choose the best method to balance loads for pservers.
     min_block_size (int): Minimum splitted element number in block.
         According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
-        We can use bandwidth effiently when data size is larger than 2MB.If you 
+        We can use bandwidth effiently when data size is larger than 2MB.If you
         want to change it, please be sure you see the slice_variable function.
     """
 
@@ -218,7 +217,7 @@ class DistributeTranspiler(object):
         #       fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1
         #       fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
         # shuffle the map will avoid the uneven distribution above
-        grad_var_mapping_items = self.grad_var_mapping.items()
+        grad_var_mapping_items = list(self.grad_var_mapping.items())
         if not self.config.slice_var_up:
             random.seed(self.trainer_num)
             random.shuffle(grad_var_mapping_items)
@@ -278,7 +277,7 @@ class DistributeTranspiler(object):
             self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])
 
         # step4: Concat the parameters splits together after recv.
-        for varname, splited_var in self.param_var_mapping.iteritems():
+        for varname, splited_var in list(self.param_var_mapping.items()):
             eps = []
             for var in splited_var:
                 index = [v.name for v in recv_vars].index(var.name)
@@ -293,16 +292,17 @@ class DistributeTranspiler(object):
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                 })
 
-        program.global_block().append_op(
-            type="fetch_barrier",
-            inputs={},
-            outputs={},
-            attrs={
-                "endpoints": pserver_endpoints,
-                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-            })
+        if self.sync_mode:
+            program.global_block().append_op(
+                type="fetch_barrier",
+                inputs={},
+                outputs={},
+                attrs={
+                    "endpoints": pserver_endpoints,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
 
-        for varname, splited_var in self.param_var_mapping.iteritems():
+        for varname, splited_var in list(self.param_var_mapping.items()):
             if len(splited_var) <= 1:
                 continue
             orig_param = program.global_block().vars[varname]
@@ -347,6 +347,7 @@ class DistributeTranspiler(object):
 
         # step1
         pserver_program = Program()
+        pserver_program.random_seed = self.origin_program.random_seed
         # step2: Create vars to receive vars at parameter servers.
         recv_inputs = []
         for v in self.param_grad_ep_mapping[endpoint]["params"]:
@@ -371,7 +372,7 @@ class DistributeTranspiler(object):
                     dtype=v.dtype,
                     shape=v.shape)
             if self.sync_mode and self.trainer_num > 1:
-                for trainer_id in xrange(self.trainer_num):
+                for trainer_id in range(self.trainer_num):
                     var = pserver_program.global_block().create_var(
                         name="%s.trainer_%d" % (orig_var_name, trainer_id),
                         persistable=False,
@@ -461,7 +462,7 @@ class DistributeTranspiler(object):
             per_opt_block = pserver_program.create_block(pre_block_idx)
             optimize_blocks.append(per_opt_block)
             # append grad merging ops before clip and weight decay
-            # cases may like: 
+            # cases may like:
             # L2Decay op -> clip op -> optimize
             for _, op in enumerate(self.optimize_ops):
                 # find the origin @GRAD var before clipping
@@ -494,6 +495,7 @@ class DistributeTranspiler(object):
             pserver_index = self.pserver_endpoints.index(endpoint)
             table_opt_block = self._create_table_optimize_block(
                 pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
+            optimize_blocks.append(table_opt_block)
             prefetch_var_name_to_block_id = self._create_prefetch_block(
                 pserver_index, pserver_program, table_opt_block)
             checkpoint_block_id = self._create_checkpoint_save_block(
@@ -548,6 +550,7 @@ class DistributeTranspiler(object):
         """
         s_prog = Program()
         orig_s_prog = default_startup_program()
+        s_prog.random_seed = orig_s_prog.random_seed
         params = self.param_grad_ep_mapping[endpoint]["params"]
 
         def _get_splited_name_and_shape(varname):
@@ -560,7 +563,7 @@ class DistributeTranspiler(object):
         # 1. create vars in pserver program to startup program
         pserver_vars = pserver_program.global_block().vars
         created_var_map = dict()
-        for _, var in pserver_vars.iteritems():
+        for _, var in list(pserver_vars.items()):
             tmpvar = s_prog.global_block()._clone_variable(var)
             created_var_map[var.name] = tmpvar
 
@@ -810,7 +813,9 @@ class DistributeTranspiler(object):
                         outputs={"Out": prefetch_output_vars},
                         attrs={
                             "epmap": pserver_endpoints,
-                            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                            # FIXME(qiao) temporarily disable this config because prefetch
+                            # is not act as other rpc op, it's more like a forward op
+                            # RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                         })
 
                     # insert concat_op
@@ -922,8 +927,6 @@ class DistributeTranspiler(object):
             self.table_name
         ][0]
         table_opt_block = pserver_program.create_block(pre_block_idx)
-        # only support sgd now
-        assert table_opt_op.type == "sgd"
 
         if self.sync_mode:
             # create grad vars in pserver program
@@ -963,11 +966,12 @@ class DistributeTranspiler(object):
             "LearningRate": [lr_var]
         }
         outputs = {"ParamOut": [param_var]}
-        table_opt_block.append_op(
-            type=table_opt_op.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=table_opt_op.attrs)
+        # only support sgd now
+        import logging
+        logging.warn(
+            "distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of "
+            + table_opt_op.type)
+        table_opt_block.append_op(type="sgd", inputs=inputs, outputs=outputs)
 
         # add table parameter gradient and it's block id to grad_to_block_id
         grad_to_block_id.append(grad_var.name + ":" + str(table_opt_block.idx))
@@ -1018,11 +1022,11 @@ class DistributeTranspiler(object):
         var_mapping = dict()
         for block_str in block_list:
             varname, offset, size = block_str.split(":")
-            if not block_map.has_key(varname):
+            if varname not in block_map:
                 block_map[varname] = []
-            block_map[varname].append((long(offset), long(size)))
+            block_map[varname].append((int(offset), int(size)))
 
-        for varname, splited in block_map.iteritems():
+        for varname, splited in list(block_map.items()):
             orig_var = program.global_block().var(varname)
             if len(splited) == 1:
                 if self.sync_mode and add_trainer_suffix:
@@ -1185,7 +1189,7 @@ class DistributeTranspiler(object):
         grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx))
         if self.sync_mode and self.trainer_num > 1:
             vars2merge = []
-            for i in xrange(self.trainer_num):
+            for i in range(self.trainer_num):
                 per_trainer_name = "%s.trainer_%d" % \
                 (merged_var_name, i)
                 vars2merge.append(pserver_block.vars[per_trainer_name])
@@ -1233,7 +1237,7 @@ class DistributeTranspiler(object):
                 # learning rate variable has already be created by non-optimize op,
                 # don't create it once again.
                 lr_varname = opt_op.input(key)[0]
-                if pserver_block.vars.has_key(lr_varname):
+                if lr_varname in pserver_block.vars:
                     new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]]
                 else:
                     origin_var = origin_program.global_block().vars[lr_varname]
@@ -1273,7 +1277,9 @@ class DistributeTranspiler(object):
 
     def _is_splited_grad_var(self, var, var_dict):
         grad_block = None
-        for _, g in var_dict.iteritems():
+        # TODO(minqiyang): replace these items() with six.iteritems() to
+        # improve memory
+        for _, g in list(var_dict.items()):
             if self._orig_varname(g.name) == self._orig_varname(var.name):
                 if g.name.find(".trainer_") == -1:
                     grad_block = g
@@ -1283,7 +1289,7 @@ class DistributeTranspiler(object):
     def _clone_lr_op(self, program, block, op):
         inputs = self._get_input_map_from_op(
             self.origin_program.global_block().vars, op)
-        for key, varlist in inputs.iteritems():
+        for key, varlist in list(inputs.items()):
             if not isinstance(varlist, list):
                 varlist = [varlist]
             for var in varlist:
@@ -1292,7 +1298,7 @@ class DistributeTranspiler(object):
 
         outputs = self._get_output_map_from_op(
             self.origin_program.global_block().vars, op)
-        for key, varlist in outputs.iteritems():
+        for key, varlist in list(outputs.items()):
             if not isinstance(varlist, list):
                 varlist = [varlist]
             for var in varlist:
@@ -1307,7 +1313,7 @@ class DistributeTranspiler(object):
         # Append the ops for parameters that do not need to be optimized/updated
         inputs = self._get_input_map_from_op(
             self.origin_program.global_block().vars, opt_op)
-        for key, varlist in inputs.iteritems():
+        for key, varlist in list(inputs.items()):
             if not isinstance(varlist, list):
                 varlist = [varlist]
             for var in varlist:
@@ -1317,7 +1323,7 @@ class DistributeTranspiler(object):
                     var, program.global_block().vars)
                 if grad_block:
                     inputs[key] = grad_block
-                elif not program.global_block().vars.has_key(var.name):
+                elif var.name not in program.global_block().vars:
                     program.global_block().create_var(
                         name=var.name,
                         persistable=var.persistable,
@@ -1326,7 +1332,7 @@ class DistributeTranspiler(object):
 
         outputs = self._get_output_map_from_op(
             self.origin_program.global_block().vars, opt_op)
-        for key, varlist in outputs.iteritems():
+        for key, varlist in list(outputs.items()):
             if not isinstance(varlist, list):
                 varlist = [varlist]
             for var in varlist:
@@ -1334,7 +1340,7 @@ class DistributeTranspiler(object):
                     var, program.global_block().vars)
                 if grad_block:
                     outputs[key] = grad_block
-                elif not program.global_block().vars.has_key(var.name):
+                elif var.name not in program.global_block().vars:
                     program.global_block()._clone_variable(var)
 
         return optimize_block.append_op(
@@ -1355,8 +1361,8 @@ class DistributeTranspiler(object):
     def _create_ufind(self, optimize_ops):
         # Create a unit find data struct by optimize ops
         ufind = UnionFind(optimize_ops)
-        for i in xrange(len(optimize_ops)):
-            for j in xrange(i, len(optimize_ops)):
+        for i in range(len(optimize_ops)):
+            for j in range(i, len(optimize_ops)):
                 op1 = optimize_ops[i]
                 op2 = optimize_ops[j]
                 if self._is_op_connected(op1, op2):
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index f1905f0878..142fa5c31d 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -305,6 +305,6 @@ class InferenceTranspiler(object):
             args += current_op.output_arg_names
         args = list(set(args))  # unique the input and output arguments
 
-        for var in self.block.vars.keys():
+        for var in list(self.block.vars.keys()):
             if var not in args:
                 self.block._remove_var(var)
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 0ca5cf813b..20ba7ed2b0 100644
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -16,6 +16,8 @@ from collections import defaultdict
 from .. import core
 from ..framework import Program, default_main_program, Parameter
 from ..backward import _rename_arg_
+from functools import reduce
+from six.moves import range
 
 dtype_to_size = {
     core.VarDesc.VarType.FP16: 2,
@@ -107,7 +109,7 @@ class ControlFlowGraph(object):
         # Repeatedly apply liveness updates until the algorithm stablize
         # on a complete set live input vars and live output vars.
         while True:
-            for i in reversed(range(self.op_size)):
+            for i in reversed(list(range(self.op_size))):
                 live_in[i] = set(self._live_in[i])
                 live_out[i] = set(self._live_out[i])
                 for s in self._successors[i]:
@@ -172,9 +174,10 @@ class ControlFlowGraph(object):
             is_forward = i < self._forward_num
             in_diff, out_diff = self._get_diff(self._live_in[i],
                                                self._live_out[i])
-            can_optimize = filter(
-                lambda x: self._check_var_validity(block_desc, x, is_forward),
-                in_diff)
+            can_optimize = [
+                x for x in in_diff
+                if self._check_var_validity(block_desc, x, is_forward)
+            ]
             if can_optimize:
                 index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1
                 delete_op = block_desc._insert_op(index)
@@ -213,9 +216,10 @@ class ControlFlowGraph(object):
             block_desc = op.block()
             is_forward = i < self._forward_num
             if self.pool:
-                defs_can_optimize = filter(
-                    lambda x: self._check_var_validity(block_desc, x, is_forward),
-                    self._defs[i])
+                defs_can_optimize = [
+                    x for x in self._defs[i]
+                    if self._check_var_validity(block_desc, x, is_forward)
+                ]
                 out_pair = [
                     (x, self._find_var(block_desc, x, is_forward).shape())
                     for x in defs_can_optimize
@@ -261,9 +265,10 @@ class ControlFlowGraph(object):
                         break
 
             in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
-            can_optimize = filter(
-                lambda x: self._check_var_validity(block_desc, x, is_forward),
-                in_diff)
+            can_optimize = [
+                x for x in in_diff
+                if self._check_var_validity(block_desc, x, is_forward)
+            ]
             if can_optimize:
                 for var_name in can_optimize:
                     self.pool.append((var_name, self._find_var(
diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py
index 776619cd36..b125eba4f8 100644
--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -14,6 +14,7 @@
 
 import collections
 import contextlib
+import six
 import sys
 
 __all__ = ['generate', 'switch', 'guard']
@@ -67,8 +68,10 @@ def switch(new_generator=None):
 
 @contextlib.contextmanager
 def guard(new_generator=None):
-    if isinstance(new_generator, basestring):
+    if isinstance(new_generator, six.string_types):
         new_generator = UniqueNameGenerator(new_generator)
+    elif isinstance(new_generator, six.binary_type):
+        new_generator = UniqueNameGenerator(new_generator.decode())
     old = switch(new_generator)
     yield
     switch(old)
diff --git a/python/paddle/reader/creator.py b/python/paddle/reader/creator.py
index 4c905d959f..c861020225 100644
--- a/python/paddle/reader/creator.py
+++ b/python/paddle/reader/creator.py
@@ -67,11 +67,14 @@ def recordio(paths, buf_size=100):
 
     import recordio as rec
     import paddle.reader.decorator as dec
-    import cPickle as pickle
+    import six
+    import six.moves.cPickle as pickle
 
     def reader():
-        if isinstance(paths, basestring):
+        if isinstance(paths, six.string_types):
             path = paths
+        elif isinstance(paths, six.binary_type):
+            path = paths.decode()
         else:
             path = ",".join(paths)
         f = rec.reader(path)
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index 4b1fe94222..ce410e61b9 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -21,6 +21,9 @@ from threading import Thread
 import subprocess
 
 from six.moves.queue import Queue
+from six.moves import zip_longest
+from six.moves import map
+from six.moves import zip
 import itertools
 import random
 import zlib
@@ -42,7 +45,7 @@ def map_readers(func, *readers):
         rs = []
         for r in readers:
             rs.append(r())
-        for e in itertools.imap(func, *rs):
+        for e in map(func, *rs):
             yield e
 
     return reader
@@ -148,16 +151,16 @@ def compose(*readers, **kwargs):
         for r in readers:
             rs.append(r())
         if not check_alignment:
-            for outputs in itertools.izip(*rs):
-                yield sum(map(make_tuple, outputs), ())
+            for outputs in zip(*rs):
+                yield sum(list(map(make_tuple, outputs)), ())
         else:
-            for outputs in itertools.izip_longest(*rs):
+            for outputs in zip_longest(*rs):
                 for o in outputs:
                     if o is None:
                         # None will be not be present if compose is aligned
                         raise ComposeNotAligned(
                             "outputs of readers are not aligned.")
-                yield sum(map(make_tuple, outputs), ())
+                yield sum(list(map(make_tuple, outputs)), ())
 
     return reader
 
@@ -306,7 +309,7 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
         args = (in_queue, out_queue, mapper, out_order) if order else (
             in_queue, out_queue, mapper)
         workers = []
-        for i in xrange(process_num):
+        for i in range(process_num):
             worker = Thread(target=target, args=args)
             worker.daemon = True
             workers.append(worker)
diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py
index bee24d3b65..537df489b9 100644
--- a/python/paddle/reader/tests/decorator_test.py
+++ b/python/paddle/reader/tests/decorator_test.py
@@ -136,7 +136,7 @@ class TestXmap(unittest.TestCase):
                     reader = paddle.reader.xmap_readers(mapper,
                                                         reader_creator_10(0),
                                                         tNum, size, order)
-                    for n in xrange(3):
+                    for n in range(3):
                         result = []
                         for i in reader():
                             result.append(i)
@@ -156,7 +156,7 @@ class TestPipeReader(unittest.TestCase):
 
         import tempfile
 
-        records = [str(i) for i in xrange(5)]
+        records = [str(i) for i in range(5)]
         temp = tempfile.NamedTemporaryFile()
         try:
             with open(temp.name, 'w') as f:
diff --git a/python/paddle/trainer/PyDataProviderWrapper.py b/python/paddle/trainer/PyDataProviderWrapper.py
index 6af2507728..374976db9f 100644
--- a/python/paddle/trainer/PyDataProviderWrapper.py
+++ b/python/paddle/trainer/PyDataProviderWrapper.py
@@ -42,7 +42,7 @@ except ImportError:
 try:
     import cPickle as pickle
 except ImportError:
-    import pickle
+    import six.moves.cPickle as pickle
 
 import io
 
diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py
index ab9a2562dc..a2a32d848c 100644
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ b/python/paddle/trainer_config_helpers/data_sources.py
@@ -20,7 +20,7 @@ from .utils import deprecated
 try:
     import cPickle as pickle
 except ImportError:
-    import pickle
+    import six.moves.cPickle as pickle
 
 __all__ = ['define_py_data_sources2']
 
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index d9787ef42a..ee34c15733 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -28,7 +28,7 @@ from .default_decorators import *
 try:
     import cPickle as pickle
 except ImportError:
-    import pickle
+    import six.moves.cPickle as pickle
 import copy
 
 __all__ = [
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
index 0d544efac9..8312900dc4 100644
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -29,13 +29,13 @@ __all__ = ['test, get_dict', 'get_embedding', 'convert']
 
 DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
 DATA_MD5 = '387719152ae52d60422c016e92a742fc'
-WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
+WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
 WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
-VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
+VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
 VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
-TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
+TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
 TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
-EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
+EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
 EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
 
 UNK_IDX = 0
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index 5104e29051..1ec210f265 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -41,7 +41,7 @@ URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/'
              'wmt_shrinked_data/wmt14.tgz')
 MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
 # BLEU of this trained model is 26.92
-URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
+URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz'
 MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
 
 START = "<s>"
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
index 2c65222c8a..aa14d3a2a1 100755
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -4,7 +4,7 @@ TOTAL_ERRORS=0
 
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
-    if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*) ]]; then
+    if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*|patches/grpc/.*) ]]; then
         continue;
     else
         cpplint --filter=-readability/fn_size $file;
diff --git a/tools/diff_api.py b/tools/diff_api.py
index cf9f2c72cb..97c739ed2a 100644
--- a/tools/diff_api.py
+++ b/tools/diff_api.py
@@ -20,9 +20,7 @@ for each_diff in result:
     if each_diff[0] in ['-', '?']:  # delete or change API is not allowed
         error = True
     elif each_diff[0] == '+':
-        # only new layers is allowed.
-        if not each_diff.startswith('+ paddle.fluid.layers.'):
-            error = True
+        error = True
 
     if each_diff[0] != ' ':
         print(each_diff)
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index bca0b77ad7..0d59e4c110 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
 ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
 
-RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz
+RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
 COPY build_scripts /build_scripts
 RUN bash build_scripts/build.sh && \
   bash build_scripts/install_nccl2.sh && rm -r build_scripts
@@ -40,11 +40,13 @@ RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddl
 
 RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
     LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install -r /root/requirements.txt && \
     go get github.com/Masterminds/glide && \
     rm -rf /root/requirements.txt
 
 RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
 
 RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \
     cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
diff --git a/tools/test_runner.py b/tools/test_runner.py
index 9dc750b890..2d6a3cf8a9 100644
--- a/tools/test_runner.py
+++ b/tools/test_runner.py
@@ -12,19 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
 import unittest
 import os
 import sys
 import paddle.fluid as fluid
 import importlib
-import cStringIO
+from six.moves import cStringIO
 
 
 def main():
     sys.path.append(os.getcwd())
     some_test_failed = False
     for module_name in sys.argv[1:]:
-        buffer = cStringIO.StringIO()
+        buffer = cStringIO()
         main = fluid.Program()
         startup = fluid.Program()
         scope = fluid.core.Scope()
@@ -37,8 +38,11 @@ def main():
                     res = unittest.TextTestRunner(stream=buffer).run(tests)
                     if not res.wasSuccessful():
                         some_test_failed = True
-                        print >> sys.stderr, module_name, 'failed\n', buffer.getvalue(
-                        )
+                        print(
+                            module_name,
+                            'failed\n',
+                            buffer.getvalue(),
+                            file=sys.stderr)
 
     if some_test_failed:
         exit(1)