diff --git a/.gitignore b/.gitignore
index 90138f996c..fa0c888260 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ third_party/
 build_*
 # clion workspace.
 cmake-build-*
+model_test
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6aa2e1715b..ed704585d8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,13 +62,12 @@ option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
-option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
-option(WITH_INFERENCE    "Compile fluid inference library"              ON)
+option(ON_INFER         "Turn on inference optimization."               OFF)
 option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
@@ -179,6 +178,7 @@ include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
+include(external/xxhash)    # download xxhash
 
 if (NOT WIN32)
 # there is no official support of snappystream, warpctc, nccl, cupti in windows
@@ -301,3 +301,11 @@ if(WITH_DOC)
     find_python_module(recommonmark REQUIRED)
     add_subdirectory(doc)
 endif()
+
+if (ON_INFER)
+    message(STATUS "On inference mode, will take place some specific optimization.")
+    add_definitions(-DPADDLE_ON_INFERENCE)
+else()
+    #TODO(luotao), combine this warning with `make inference_lib_dist` command.
+    message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
+endif()
diff --git a/Dockerfile b/Dockerfile
index 738bba9bc2..c8b9eed6d6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -75,14 +75,14 @@ RUN pip3 install -U wheel && \
     pip3 install -U docopt PyYAML sphinx==1.5.6 && \
     pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
     easy_install -U pip && \
-    pip install -U wheel && \
+    pip install -U pip setuptools wheel && \
     pip install -U docopt PyYAML sphinx==1.5.6 && \
     pip install sphinx-rtd-theme==0.1.9 recommonmark
 
-RUN pip3 install pre-commit 'ipython==5.3.0' && \
+RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
     pip3 install opencv-python && \
-    pip install pre-commit 'ipython==5.3.0' && \
+    pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
     pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
     pip install opencv-python
 
diff --git a/README.md b/README.md
index 8ee67f6642..56d6c10c64 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.0/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
 
-### Latest PaddlePaddle Release: [Fluid 1.0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0)
+### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -27,9 +27,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.0.1.post87
+pip install paddlepaddle-gpu==1.1.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.0.1.post85
+pip install paddlepaddle-gpu==1.1.0.post85
 
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.0.1.post85
 
 ## Installation
 
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website.
 
 ## Documentation
 
-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.0.0/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation.
 
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.0/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Python API](http://paddlepaddle.org/documentation/api/zh/1.0/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.0/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html)
 
    We appreciate your contributions!
 
diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py
index 9540900b11..ff616ddbb2 100644
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@@ -142,5 +142,10 @@ def parse_args():
         choices=['reduce', 'all_reduce'],
         default='all_reduce',
         help='Specify the reduce strategy, can be reduce, all_reduce')
+    parser.add_argument(
+        '--fuse_broadcast_op',
+        action='store_true',
+        help='If set, would fuse multiple broadcast operators into one fused_broadcast operator.'
+    )
     args = parser.parse_args()
     return args
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index ddd9fe8098..5f3ce300ac 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -177,6 +177,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
     else:
         build_strategy.reduce_strategy = fluid.BuildStrategy(
         ).ReduceStrategy.AllReduce
+    build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
 
     avg_loss = train_args[0]
 
@@ -240,7 +241,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
 
             if args.use_fake_data or args.use_reader_op:
                 try:
-
                     fetch_ret = exe.run(fetch_list)
                 except fluid.core.EOFException as eof:
                     break
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
new file mode 100644
index 0000000000..c227e09719
--- /dev/null
+++ b/cmake/external/xxhash.cmake
@@ -0,0 +1,50 @@
+INCLUDE(ExternalProject)
+
+set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
+set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
+set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
+
+IF(WITH_STATIC_LIB)
+  SET(BUILD_CMD make lib)
+ELSE()
+  IF(APPLE)
+    SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
+  ELSE(APPLE)
+    SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
+  ENDIF(APPLE)
+ENDIF()
+
+ExternalProject_Add(
+    extern_xxhash
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/Cyan4973/xxHash"
+    GIT_TAG         "v0.6.5"
+    PREFIX          ${XXHASH_SOURCE_DIR}
+    DOWNLOAD_NAME   "xxhash"
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    PATCH_COMMAND
+    BUILD_COMMAND     ${BUILD_CMD}
+    INSTALL_COMMAND   export PREFIX=${XXHASH_INSTALL_DIR}/ && make install
+    TEST_COMMAND      ""
+)
+
+set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a")
+INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR})
+
+add_library(xxhash STATIC IMPORTED GLOBAL)
+set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
+include_directories(${XXHASH_INCLUDE_DIR})
+add_dependencies(xxhash extern_xxhash)
+
+LIST(APPEND external_project_dependencies xxhash)
+
+IF(WITH_C_API)
+  INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash)
+  IF(ANDROID)
+    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI})
+  ELSE()
+    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib)
+  ENDIF()
+ENDIF()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 67cca09b64..efdb093a7b 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -31,7 +31,7 @@ function(copy TARGET)
     foreach(index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
-        add_custom_command(TARGET ${TARGET} PRE_BUILD 
+        add_custom_command(TARGET ${TARGET} PRE_BUILD
           COMMAND mkdir -p "${dst}"
           COMMAND cp -r "${src}" "${dst}"
           COMMENT "copying ${src} -> ${dst}")
@@ -67,6 +67,13 @@ copy(boost_lib
   DEPS boost
 )
 
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
+copy(xxhash_lib
+  SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
+  DSTS ${dst_dir} ${dst_dir}/lib
+  DEPS xxhash
+)
+
 if(NOT PROTOBUF_FOUND)
     set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
     copy(protobuf_lib
@@ -186,7 +193,7 @@ copy(cmake_cache
   DSTS ${FLUID_INSTALL_DIR})
 
 # This command generates a complete fluid library for both train and inference
-add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) 
+add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
 
 # Following commands generate a inference-only fluid library
 # third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 6653244507..6b665a9eff 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -24,6 +24,7 @@ if(NOT WITH_FLUID_ONLY)
 endif()
 
 add_subdirectory(testing)
+set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
 if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
   add_subdirectory(fluid)
 endif()
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 19ef23cdfa..b6b7af9510 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -64,11 +64,11 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', '
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
 paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
-paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
-paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
-paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
+paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
+paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
 paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
 paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
@@ -86,7 +86,7 @@ paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name'
 paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
+paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer'))
 paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None))
 paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None))
@@ -103,11 +103,11 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
-paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
+paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
-paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None))
+paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None))
 paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None))
@@ -174,7 +174,13 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
+paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
+paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
+paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -353,6 +359,8 @@ paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_wind
 paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None))
+paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
 paddle.fluid.regularizer.L2DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,))
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index 48b36df649..7d48f00571 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -9,8 +9,6 @@ add_subdirectory(pybind)
 add_subdirectory(recordio)
 endif(NOT WIN32)
 
-if(WITH_INFERENCE)
-  # NOTE: please add subdirectory inference at last.
-  add_subdirectory(inference)
-  add_subdirectory(train)
-endif()
+# NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)
+add_subdirectory(train)
diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc
index 0dcecb62db..fabf2abfc8 100644
--- a/paddle/fluid/framework/attribute.cc
+++ b/paddle/fluid/framework/attribute.cc
@@ -64,6 +64,13 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
     case proto::AttrType::LONG: {
       return attr_desc.l();
     }
+    case proto::AttrType::LONGS: {
+      std::vector<int64_t> val(attr_desc.longs_size());
+      for (int i = 0; i < attr_desc.longs_size(); ++i) {
+        val[i] = attr_desc.longs(i);
+      }
+      return val;
+    }
     default:
       PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
   }
diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h
index 14ca3e9620..d9c76881b7 100644
--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
@@ -26,6 +26,113 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
+template <typename T>
+struct ExtractAttribute {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  T* operator()(Attribute& attr) const {
+    T* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<T>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
+                   attr_name_, paddle::platform::demangle(typeid(T).name()),
+                   paddle::platform::demangle(attr.type().name()));
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
+// special handle bool
+// FIXME(yuyang18): Currently we cast bool into int in python binding. It is
+// hard to change the logic there. In another way, we should correct handle
+// if the user set `some_flag=1`.
+//
+// FIX ME anytime if there is a better solution.
+template <>
+struct ExtractAttribute<bool> {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  bool* operator()(Attribute& attr) const {
+    if (attr.type() == typeid(int)) {  // NOLINT
+      int val = boost::get<int>(attr);
+      attr = static_cast<bool>(val);
+    } else if (attr.type() == typeid(float)) {  // NOLINT
+      float val = boost::get<float>(attr);
+      attr = static_cast<bool>(val);
+    }
+    bool* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<bool>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
+                   attr_name_, paddle::platform::demangle(attr.type().name()));
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
+template <>
+struct ExtractAttribute<int64_t> {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  int64_t* operator()(Attribute& attr) const {
+    if (attr.type() == typeid(int)) {  // NOLINT
+      int val = boost::get<int>(attr);
+      attr = static_cast<int64_t>(val);
+    } else if (attr.type() == typeid(float)) {  // NOLINT
+      int val = boost::get<float>(attr);
+      attr = static_cast<int64_t>(val);
+    }
+    int64_t* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<int64_t>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
+                   attr_name_, paddle::platform::demangle(attr.type().name()));
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
+template <>
+struct ExtractAttribute<std::vector<int64_t>> {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  std::vector<int64_t>* operator()(Attribute& attr) const {
+    if (attr.type() == typeid(std::vector<int>)) {  // NOLINT
+      std::vector<int> val = boost::get<std::vector<int>>(attr);
+      std::vector<int64_t> vec(val.begin(), val.end());
+      attr = vec;
+    } else if (attr.type() == typeid(std::vector<float>)) {  // NOLINT
+      std::vector<float> val = boost::get<std::vector<float>>(attr);
+      std::vector<int64_t> vec(val.begin(), val.end());
+      attr = vec;
+    }
+    std::vector<int64_t>* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<std::vector<int64_t>>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
+                   attr_name_, paddle::platform::demangle(attr.type().name()));
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
 template <typename T>
 inline proto::AttrType AttrTypeID() {
   Attribute tmp = T();
@@ -42,7 +149,11 @@ class AttrReader {
   inline const T& Get(const std::string& name) const {
     PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
                    name);
-    return boost::get<T>(attrs_.at(name));
+
+    Attribute& attr = const_cast<Attribute&>(attrs_.at(name));
+    ExtractAttribute<T> extract_attr(name);
+    T* attr_value = extract_attr(attr);
+    return *attr_value;
   }
 
  private:
@@ -82,7 +193,7 @@ class DefaultValueSetter {
  public:
   explicit DefaultValueSetter(T default_value)
       : default_value_(default_value) {}
-  void operator()(T& value) const { value = default_value_; }
+  void operator()(T& value) const { value = default_value_; }  // NOLINT
 
  private:
   T default_value_;
@@ -117,84 +228,6 @@ class EnumInContainer {
   std::unordered_set<T> container_;
 };
 
-template <typename T>
-struct ExtractAttribute {
-  explicit ExtractAttribute(const std::string& attr_name)
-      : attr_name_(attr_name) {}
-
-  T* operator()(Attribute& attr) const {
-    T* attr_value = nullptr;
-    try {
-      attr_value = &boost::get<T>(attr);
-    } catch (boost::bad_get& bad_get) {
-      PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
-                   attr_name_, paddle::platform::demangle(typeid(T).name()),
-                   paddle::platform::demangle(attr.type().name()));
-    }
-    return attr_value;
-  }
-
-  const std::string& attr_name_;
-};
-
-// special handle bool
-// FIXME(yuyang18): Currently we cast bool into int in python binding. It is
-// hard to change the logic there. In another way, we should correct handle
-// if the user set `some_flag=1`.
-//
-// FIX ME anytime if there is a better solution.
-template <>
-struct ExtractAttribute<bool> {
-  explicit ExtractAttribute(const std::string& attr_name)
-      : attr_name_(attr_name) {}
-
-  bool* operator()(Attribute& attr) const {
-    if (attr.type() == typeid(int)) {  // NOLINT
-      int val = boost::get<int>(attr);
-      attr = static_cast<bool>(val);
-    } else if (attr.type() == typeid(float)) {  // NOLINT
-      float val = boost::get<float>(attr);
-      attr = static_cast<bool>(val);
-    }
-    bool* attr_value = nullptr;
-    try {
-      attr_value = &boost::get<bool>(attr);
-    } catch (boost::bad_get& bad_get) {
-      PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
-                   attr_name_, paddle::platform::demangle(attr.type().name()));
-    }
-    return attr_value;
-  }
-
-  const std::string& attr_name_;
-};
-
-template <>
-struct ExtractAttribute<int64_t> {
-  explicit ExtractAttribute(const std::string& attr_name)
-      : attr_name_(attr_name) {}
-
-  int64_t* operator()(Attribute& attr) const {
-    if (attr.type() == typeid(int)) {  // NOLINT
-      int val = boost::get<int>(attr);
-      attr = static_cast<int64_t>(val);
-    } else if (attr.type() == typeid(float)) {  // NOLINT
-      int val = boost::get<float>(attr);
-      attr = static_cast<int64_t>(val);
-    }
-    int64_t* attr_value = nullptr;
-    try {
-      attr_value = &boost::get<int64_t>(attr);
-    } catch (boost::bad_get& bad_get) {
-      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
-                   attr_name_, paddle::platform::demangle(attr.type().name()));
-    }
-    return attr_value;
-  }
-
-  const std::string& attr_name_;
-};
-
 // check whether a certain attribute fit its limits
 // an attribute can have more than one limits
 template <typename T>
@@ -235,7 +268,7 @@ class TypedAttrChecker {
     return *this;
   }
 
-  void operator()(AttributeMap& attr_map) const {
+  void operator()(AttributeMap& attr_map) const {  // NOLINT
     if (!attr_map.count(attr_name_)) {
       // user do not set this attr
       PADDLE_ENFORCE(!default_value_setter_.empty(),
@@ -271,7 +304,7 @@ class OpAttrChecker {
     return *(checker.target<TypedAttrChecker<T>>());
   }
 
-  void Check(AttributeMap& attr_map) const {
+  void Check(AttributeMap& attr_map) const {  // NOLINT
     for (const auto& checker : attr_checkers_) {
       checker(attr_map);
     }
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index e0a3ef5a9c..d8bc72e6b2 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -16,12 +16,14 @@ if(WITH_GPU)
             dynload_cuda variable_visitor)
     nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
     nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+    nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 
 else()
     cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
              variable_visitor)
     cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
     cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+    cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 endif()
 
 cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
@@ -33,13 +35,15 @@ if(WITH_GPU)
           all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 endif()
 
+cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
+
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
 
 if(WITH_GPU)
-  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass)
+  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass sequential_execution_pass)
 else()
-  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
+  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto sequential_execution_pass)
 endif()
 
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
@@ -54,8 +58,9 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
 #        device_context reduce_op_handle )
 cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
         DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
+cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
 
 cc_library(build_strategy SRCS build_strategy.cc DEPS
         graph_viz_pass multi_devices_graph_pass
         multi_devices_graph_print_pass multi_devices_graph_check_pass
-        fuse_elewise_add_act_pass)
+        fuse_elewise_add_act_pass multi_batch_merge_pass)
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 7c5f5bd80a..b869015676 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
       nccl_ctxs_(ctxs) {
   if (nccl_ctxs_) {
     for (auto &p : places_) {
-      this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
+      this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
     }
   }
 }
@@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
 
   if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
@@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() {
             *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
         auto &p = places_[i];
         auto *var = scope.FindVar(out_var_handles[i]->name_);
-        auto *dev_ctx = dev_ctxes_[p];
+        auto *dev_ctx = dev_ctxes_.at(p);
 
         RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
           auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 4fdab5cd94..7f0d06c892 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -48,16 +48,27 @@ void BroadcastOpHandle::RunImpl() {
     var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
   }
 
+  BroadcastOneVar(*in_var_handle, out_var_handles, var_scopes);
+}
+
+void BroadcastOpHandle::BroadcastOneVar(
+    const VarHandle &in_var_handle,
+    const std::vector<VarHandle *> &out_var_handles,
+    const std::vector<const Scope *> &var_scopes) {
   auto *in_var =
-      var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_);
+      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);
   PADDLE_ENFORCE_NOT_NULL(in_var);
   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
+  if (UNLIKELY(!in_tensor.IsInitialized())) {
+    VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!";
+    return;
+  }
 
-  InitOutputValue(*in_var_handle, out_var_handles);
+  InitOutputValue(in_var_handle, out_var_handles);
 
   if (platform::is_cpu_place(in_tensor.place())) {
     for (auto *out_var_handle : out_var_handles) {
-      if (out_var_handle->IsTheSameVar(*in_var_handle)) {
+      if (out_var_handle->IsTheSameVar(in_var_handle)) {
         continue;
       }
       auto &out_p = out_var_handle->place_;
@@ -114,12 +125,12 @@ void BroadcastOpHandle::RunImpl() {
         }
       }
 
-      if (!out_handle->IsTheSameVar(*in_var_handle)) {
-        auto out_var = var_scopes.at(in_var_handle->scope_idx_)
+      if (!out_handle->IsTheSameVar(in_var_handle)) {
+        auto out_var = var_scopes.at(in_var_handle.scope_idx_)
                            ->FindVar(out_var_handles[0]->name_);
         paddle::framework::TensorCopy(
-            in_tensor, in_var_handle->place_,
-            *(dev_ctxes_.at(in_var_handle->place_)),
+            in_tensor, in_var_handle.place_,
+            *(dev_ctxes_.at(in_var_handle.place_)),
             &VariableVisitor::GetMutableTensor(out_var));
       }
     });
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index fe4e733e43..72180fac86 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase {
         nccl_ctxs_(nccl_ctxs) {
     if (nccl_ctxs_) {
       for (auto &p_ctx : nccl_ctxs_->contexts_) {
-        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+        this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
+                               p_ctx.second.ctx_.get());
       }
     }
   }
@@ -61,7 +62,10 @@ struct BroadcastOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
 
- private:
+  void BroadcastOneVar(const VarHandle &in_var_handle,
+                       const std::vector<VarHandle *> &out_var_handles,
+                       const std::vector<const Scope *> &var_scopes);
+
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
index ab7412a19f..650de5a48d 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -12,232 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/details/broadcast_op_handle.h"
-#include "gtest/gtest.h"
-
-#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-// test data amount
-const f::DDim kDims = {20, 20};
-
-struct TestBroadcastOpHandle {
-  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
-  std::vector<Scope*> local_scopes_;
-  std::vector<Scope*> param_scopes_;
-  Scope g_scope_;
-  std::unique_ptr<OpHandleBase> op_handle_;
-  std::vector<std::unique_ptr<VarHandleBase>> vars_;
-  std::vector<p::Place> gpu_list_;
-  bool use_gpu_;
-#ifdef PADDLE_WITH_CUDA
-  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
-#endif
-
-  void WaitAll() {
-    for (size_t j = 0; j < ctxs_.size(); ++j) {
-      ctxs_[j]->Wait();
-    }
-#ifdef PADDLE_WITH_CUDA
-    if (nccl_ctxs_) {
-      nccl_ctxs_->WaitAll();
-    }
-#endif
-  }
-
-  void InitCtxOnGpu(bool use_gpu) {
-    use_gpu_ = use_gpu;
-    if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
-      int count = p::GetCUDADeviceCount();
-      if (count <= 1) {
-        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
-                        "device count is "
-                     << count;
-        exit(0);
-      }
-      for (int i = 0; i < count; ++i) {
-        auto p = p::CUDAPlace(i);
-        gpu_list_.push_back(p);
-        ctxs_.emplace_back(new p::CUDADeviceContext(p));
-      }
-      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
-#else
-      PADDLE_THROW("CUDA is not support.");
-#endif
-    } else {
-      int count = 8;
-      for (int i = 0; i < count; ++i) {
-        auto p = p::CPUPlace();
-        gpu_list_.push_back(p);
-        ctxs_.emplace_back(new p::CPUDeviceContext(p));
-      }
-#ifdef PADDLE_WITH_CUDA
-      nccl_ctxs_.reset(nullptr);
-#endif
-    }
-  }
-
-  void InitBroadcastOp(size_t input_scope_idx) {
-    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      local_scopes_.push_back(&(g_scope_.NewScope()));
-      Scope& local_scope = local_scopes_.back()->NewScope();
-      *local_scopes_.back()
-           ->Var(details::kLocalExecScopeName)
-           ->GetMutable<Scope*>() = &local_scope;
-      local_scope.Var("out");
-      param_scopes_.emplace_back(&local_scope);
-    }
-    param_scopes_[input_scope_idx]->Var("input");
-
-    std::unique_ptr<ir::Node> n =
-        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation);
-    if (use_gpu_) {
-#ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
-                                             nccl_ctxs_.get()));
-#else
-      PADDLE_THROW("CUDA is not support.");
-#endif
-    } else {
-#ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
-                                             nccl_ctxs_.get()));
-#else
-      op_handle_.reset(
-          new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_));
-#endif
-    }
-
-    std::unique_ptr<ir::Node> v =
-        ir::CreateNodeForTest("node1", ir::Node::Type::kVariable);
-    auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
-                                        gpu_list_[input_scope_idx]);
-    vars_.emplace_back(in_var_handle);
-    op_handle_->AddInput(in_var_handle);
-
-    // add dummy var
-
-    std::unique_ptr<ir::Node> v2 =
-        ir::CreateNodeForTest("node2", ir::Node::Type::kVariable);
-    vars_.emplace_back(new DummyVarHandle(v2.get()));
-    DummyVarHandle* dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back().get());
-    dummy_var_handle->ClearGeneratedOp();
-    op_handle_->AddInput(dummy_var_handle);
-
-    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      if (!use_gpu_) {
-        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
-      }
-      std::unique_ptr<ir::Node> v3 =
-          ir::CreateNodeForTest("node3", ir::Node::Type::kVariable);
-      VarHandle* out_var_handle =
-          new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]);
-      vars_.emplace_back(out_var_handle);
-      op_handle_->AddOutput(out_var_handle);
-    }
-
-    // add dummy var
-    std::unique_ptr<ir::Node> v4 =
-        ir::CreateNodeForTest("node4", ir::Node::Type::kVariable);
-    vars_.emplace_back(new DummyVarHandle(v4.get()));
-    DummyVarHandle* out_dummy_var_handle =
-        static_cast<DummyVarHandle*>(vars_.back().get());
-    out_dummy_var_handle->ClearGeneratedOp();
-    op_handle_->AddOutput(out_dummy_var_handle);
-  }
-
-  void TestBroadcastLodTensor(size_t input_scope_idx) {
-    auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
-    PADDLE_ENFORCE_NOT_NULL(in_var);
-    auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
-    in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
-
-    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
-    for (size_t k = 0; k < send_vector.size(); ++k) {
-      send_vector[k] = k;
-    }
-    f::LoD lod{{0, 10, 20}};
-    paddle::framework::TensorFromVector<float>(
-        send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
-    in_lod_tensor->set_lod(lod);
-    in_lod_tensor->Resize(kDims);
-
-    op_handle_->Run(false);
-
-    WaitAll();
-
-    p::CPUPlace cpu_place;
-    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      auto out_var = param_scopes_[j]->FindVar("out");
-      PADDLE_ENFORCE_NOT_NULL(out_var);
-      auto out_tensor = out_var->Get<f::LoDTensor>();
-      PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
-
-      f::Tensor result_tensor;
-      f::TensorCopySync(out_tensor, cpu_place, &result_tensor);
-      float* ct = result_tensor.mutable_data<float>(cpu_place);
-
-      for (int64_t i = 0; i < f::product(kDims); ++i) {
-        ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
-      }
-    }
-  }
-
-  void TestBroadcastSelectedRows(size_t input_scope_idx) {
-    auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
-    PADDLE_ENFORCE_NOT_NULL(in_var);
-    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
-    auto value = in_selected_rows->mutable_value();
-    value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
-    int height = static_cast<int>(kDims[0]) * 2;
-    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
-                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
-    in_selected_rows->set_height(height);
-    in_selected_rows->set_rows(rows);
-
-    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
-    for (size_t k = 0; k < send_vector.size(); ++k) {
-      send_vector[k] = k;
-    }
-    paddle::framework::TensorFromVector<float>(
-        send_vector, *(ctxs_[input_scope_idx]), value);
-
-    op_handle_->Run(false);
-
-    WaitAll();
-
-    p::CPUPlace cpu_place;
-    for (size_t j = 0; j < gpu_list_.size(); ++j) {
-      auto out_var = param_scopes_[j]->FindVar("out");
-      PADDLE_ENFORCE_NOT_NULL(out_var);
-      auto& out_select_rows = out_var->Get<f::SelectedRows>();
-      auto rt = out_select_rows.value();
-
-      PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
-                        "height is not equal.");
-      for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
-        PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
-      }
-
-      f::Tensor result_tensor;
-      f::TensorCopySync(rt, cpu_place, &result_tensor);
-      float* ct = result_tensor.data<float>();
-
-      for (int64_t i = 0; i < f::product(kDims); ++i) {
-        ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
-      }
-    }
-  }
-};
-
 TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
   TestBroadcastOpHandle test_op;
   size_t input_scope_idx = 0;
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h
new file mode 100644
index 0000000000..1a2a9ac328
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -0,0 +1,271 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+struct TestBroadcastOpHandle {
+  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
+  std::vector<Scope*> local_scopes_;
+  std::vector<Scope*> param_scopes_;
+  Scope g_scope_;
+  std::unique_ptr<OpHandleBase> op_handle_;
+  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  std::vector<p::Place> place_list_;
+  bool use_gpu_;
+#ifdef PADDLE_WITH_CUDA
+  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
+#endif
+
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+#ifdef PADDLE_WITH_CUDA
+    if (nccl_ctxs_) {
+      nccl_ctxs_->WaitAll();
+    }
+#endif
+  }
+
+  void InitCtxOnGpu(bool use_gpu) {
+    use_gpu_ = use_gpu;
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        place_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+      nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        place_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
+#ifdef PADDLE_WITH_CUDA
+      nccl_ctxs_.reset(nullptr);
+#endif
+    }
+  }
+
+  void InitBroadcastOp(size_t input_scope_idx) {
+    for (size_t j = 0; j < place_list_.size(); ++j) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      Scope& local_scope = local_scopes_.back()->NewScope();
+      *local_scopes_.back()
+           ->Var(details::kLocalExecScopeName)
+           ->GetMutable<Scope*>() = &local_scope;
+      local_scope.Var("out");
+      param_scopes_.emplace_back(&local_scope);
+    }
+    param_scopes_[input_scope_idx]->Var("input");
+
+    std::unique_ptr<ir::Node> n =
+        ir::CreateNodeForTest("node0", ir::Node::Type::kOperation);
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_,
+                                             place_list_, nccl_ctxs_.get()));
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_,
+                                             place_list_, nccl_ctxs_.get()));
+#else
+      op_handle_.reset(
+          new BroadcastOpHandle(n.get(), local_scopes_, place_list_));
+#endif
+    }
+
+    std::unique_ptr<ir::Node> v =
+        ir::CreateNodeForTest("node1", ir::Node::Type::kVariable);
+    auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
+                                        place_list_[input_scope_idx]);
+    vars_.emplace_back(in_var_handle);
+    op_handle_->AddInput(in_var_handle);
+
+    // add dummy var
+
+    std::unique_ptr<ir::Node> v2 =
+        ir::CreateNodeForTest("node2", ir::Node::Type::kVariable);
+    vars_.emplace_back(new DummyVarHandle(v2.get()));
+    DummyVarHandle* dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    dummy_var_handle->ClearGeneratedOp();
+    op_handle_->AddInput(dummy_var_handle);
+
+    for (size_t j = 0; j < place_list_.size(); ++j) {
+      if (!use_gpu_) {
+        op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get());
+      }
+      std::unique_ptr<ir::Node> v3 =
+          ir::CreateNodeForTest("node3", ir::Node::Type::kVariable);
+      VarHandle* out_var_handle =
+          new VarHandle(v3.get(), 2, j, "out", place_list_[j]);
+      vars_.emplace_back(out_var_handle);
+      op_handle_->AddOutput(out_var_handle);
+    }
+
+    // add dummy var
+    std::unique_ptr<ir::Node> v4 =
+        ir::CreateNodeForTest("node4", ir::Node::Type::kVariable);
+    vars_.emplace_back(new DummyVarHandle(v4.get()));
+    DummyVarHandle* out_dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    out_dummy_var_handle->ClearGeneratedOp();
+    op_handle_->AddOutput(out_dummy_var_handle);
+  }
+
+  std::vector<float> InitLoDTensor(const std::string& varname,
+                                   size_t input_scope_idx, const f::LoD& lod,
+                                   float val_scalar = 0.0) {
+    auto var = param_scopes_[input_scope_idx]->FindVar(varname);
+
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto lod_tensor = var->GetMutable<f::LoDTensor>();
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k + val_scalar;
+    }
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), lod_tensor);
+    lod_tensor->set_lod(lod);
+    lod_tensor->Resize(kDims);
+    return send_vector;
+  }
+
+  std::vector<float> InitSelectedRows(const std::string& varname,
+                                      size_t input_scope_idx,
+                                      const std::vector<int64_t>& rows,
+                                      int height, float value_scalar = 0.0) {
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k + value_scalar;
+    }
+
+    auto var = param_scopes_[input_scope_idx]->FindVar(varname);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto selected_rows = var->GetMutable<f::SelectedRows>();
+    auto value = selected_rows->mutable_value();
+    value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
+    selected_rows->set_height(height);
+    selected_rows->set_rows(rows);
+
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), value);
+
+    return send_vector;
+  }
+
+  void SelectedRowsEqual(const std::string& varname, int input_scope_idx,
+                         const std::vector<float>& send_vector,
+                         const std::vector<int64_t>& rows, int height) {
+    auto var = param_scopes_[input_scope_idx]->FindVar(varname);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto& selected_rows = var->Get<f::SelectedRows>();
+    auto rt = selected_rows.value();
+    PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal.");
+
+    for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
+      PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]);
+    }
+
+    p::CPUPlace cpu_place;
+    f::Tensor result_tensor;
+    f::TensorCopySync(rt, cpu_place, &result_tensor);
+    float* ct = result_tensor.data<float>();
+
+    for (int64_t i = 0; i < f::product(kDims); ++i) {
+      ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
+    }
+  }
+
+  void LoDTensorEqual(const std::string& varname,
+                      const std::vector<float>& send_vec, const f::LoD& lod,
+                      framework::Scope* scope) {
+    p::CPUPlace cpu_place;
+    auto var = scope->FindVar(varname);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto tensor = var->Get<f::LoDTensor>();
+    PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal.");
+    f::Tensor result_tensor;
+    f::TensorCopySync(tensor, cpu_place, &result_tensor);
+    float* ct = result_tensor.mutable_data<float>(cpu_place);
+    for (int64_t k = 0; k < f::product(kDims); ++k) {
+      ASSERT_NEAR(ct[k], send_vec[k], 1e-5);
+    }
+  }
+
+  void TestBroadcastLodTensor(size_t input_scope_idx) {
+    f::LoD lod{{0, 10, 20}};
+    auto send_vector = InitLoDTensor("input", input_scope_idx, lod);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+    for (size_t j = 0; j < place_list_.size(); ++j) {
+      LoDTensorEqual("out", send_vector, lod, param_scopes_[j]);
+    }
+  }
+
+  void TestBroadcastSelectedRows(size_t input_scope_idx) {
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    int height = static_cast<int>(kDims[0] * 2);
+    auto send_vector = InitSelectedRows("input", input_scope_idx, rows, height);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+    for (size_t j = 0; j < place_list_.size(); ++j) {
+      SelectedRowsEqual("out", input_scope_idx, send_vector, rows, height);
+    }
+  }
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 6a6b497fa8..bc19bd3661 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 
@@ -27,6 +28,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
  public:
   explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
       : ir::PassBuilder(), strategy_(strategy) {
+    if (strategy_.enable_sequential_execution_) {
+      AppendPass("sequential_execution_pass");
+    }
+
     // Add a graph viz pass to record a graph.
     if (!strategy_.debug_graphviz_path_.empty()) {
       auto viz_pass = AppendPass("graph_viz_pass");
@@ -110,6 +115,11 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->Erase("nccl_ctxs");
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
+    } else if (pass->Type() == "sequential_execution_pass") {
+      pass->Erase(kAllOpDescs);
+      pass->Set<const std::vector<OpDesc *>>(
+          kAllOpDescs,
+          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
     }
     graph = pass->Apply(std::move(graph));
   }
@@ -121,6 +131,8 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
 
 USE_PASS(fuse_elewise_add_act_pass);
 USE_PASS(graph_viz_pass);
+USE_PASS(multi_batch_merge_pass);
 USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
+USE_PASS(sequential_execution_pass);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 02c4bea169..88459320b0 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -69,6 +69,10 @@ struct BuildStrategy {
 
   bool enable_data_balance_{false};
 
+  bool enable_sequential_execution_{false};
+
+  bool fuse_broadcast_op_{false};
+
   // User normally doesn't need to call this API.
   // The PassBuilder allows for more customized insert, remove of passes
   // from python side.
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index b6282debdb..f9bbfe0016 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() {
 bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
   bool need_wait =
       in_var && in_var->GeneratedOp() &&
-      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_];
+      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_);
   return need_wait;
 }
 
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
index 525d243224..0b772f9b63 100644
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle(
     : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
   if (ctxs) {
     for (auto &p : places_) {
-      this->dev_ctxes_[p] = ctxs->DevCtx(p);
+      this->SetDeviceContext(p, ctxs->DevCtx(p));
     }
   }
 }
@@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() {
   PADDLE_ENFORCE_GT(places_.size(), 1,
                     "Data balance can only be enabled when the number of "
                     "places to run larger than 1.");
-  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
-  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
   PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 6e22fedf1c..98fc390e72 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -92,13 +92,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
 
   size_t num_complete = 0;
   remaining_ = 0;
-  BlockingQueue<size_t> complete_q;
+  auto complete_q = std::make_shared<BlockingQueue<size_t>>();
   for (auto op : bootstrap_ops_) {
-    RunOpAsync(op_deps.get(), op, &complete_q);
+    RunOpAsync(op_deps.get(), op, complete_q);
   }
 
   while (num_complete != op_deps->size()) {
-    size_t num_comp = complete_q.Pop();
+    size_t num_comp = complete_q->Pop();
     if (num_comp == -1UL) {
       int remaining = 0;
       while (true) {
@@ -107,7 +107,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
           break;
         }
         for (int i = 0; i < remaining; ++i) {
-          complete_q.Pop();
+          complete_q->Pop();
         }
       }
       exception_.ReThrow();
@@ -120,7 +120,8 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
 }
 void FastThreadedSSAGraphExecutor::RunOpAsync(
     std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
-    OpHandleBase *op, BlockingQueue<size_t> *complete_q) {
+    OpHandleBase *op,
+    const std::shared_ptr<BlockingQueue<size_t>> &complete_q) {
   ++remaining_;
   this->pool_.enqueue([=] {
     OpHandleBase *op_to_run = op;
@@ -144,7 +145,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
             if (op_to_run == nullptr) {
               op_to_run = pending_op;
             } else {
-              this->RunOpAsync(op_deps, pending_op, complete_q);
+              RunOpAsync(op_deps, pending_op, complete_q);
             }
           }
         }
@@ -156,8 +157,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
 }
 void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
   atomic_op_deps_ = pool_.enqueue([&] {
-    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps =
-        new std::unordered_map<OpHandleBase *, std::atomic<int>>;
+    auto *op_deps = new std::unordered_map<OpHandleBase *, std::atomic<int>>;
     for (auto &pair : op_deps_) {
       (*op_deps)[pair.first] = pair.second;
     }
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index dad3a231cb..8b83824471 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -50,7 +50,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::atomic<int> remaining_;
 
   void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
-                  OpHandleBase *op, BlockingQueue<size_t> *complete_q);
+                  OpHandleBase *op,
+                  const std::shared_ptr<BlockingQueue<size_t>> &complete_q);
 
   void PrepareAtomicOpDeps();
 
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
new file mode 100644
index 0000000000..51dfa2d071
--- /dev/null
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
@@ -0,0 +1,55 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void FusedBroadcastOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+
+  if (places_.size() == 1UL) return;
+
+  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+
+  WaitInputVarGenerated();
+
+  std::vector<const Scope *> var_scopes;
+  for (auto *s : local_scopes_) {
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
+  }
+
+  size_t place_num = places_.size();
+  PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size());
+
+  for (size_t i = 0; i < in_var_handles.size(); ++i) {
+    BroadcastOneVar(
+        *in_var_handles[i],
+        std::vector<VarHandle *>(out_var_handles.begin() + i * place_num,
+                                 out_var_handles.begin() + (i + 1) * place_num),
+        var_scopes);
+  }
+}
+
+std::string FusedBroadcastOpHandle::Name() const { return "fused_broadcast"; }
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
new file mode 100644
index 0000000000..e37259526a
--- /dev/null
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
@@ -0,0 +1,57 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FusedBroadcastOpHandle : public BroadcastOpHandle {
+ public:
+#ifdef PADDLE_WITH_CUDA
+  FusedBroadcastOpHandle(ir::Node *node,
+                         const std::vector<Scope *> local_scopes,
+                         const std::vector<platform::Place> &places,
+                         const platform::NCCLContextMap *nccl_ctx)
+      : BroadcastOpHandle(node, local_scopes, places, nccl_ctx) {}
+#else
+  FusedBroadcastOpHandle(ir::Node* node, const std::vector<Scope*> local_scopes,
+                         const std::vector<platform::Place>& places)
+      : BroadcastOpHandle(node, local_scopes, places) {}
+#endif
+  std::string Name() const override;
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
new file mode 100644
index 0000000000..0f12bd2b4e
--- /dev/null
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -0,0 +1,165 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
+  std::vector<std::string> out_varnames_;
+
+  void InitFusedBroadcastOp(std::vector<size_t> input_scope_idxes) {
+    // initialize scope and var
+    for (size_t i = 0; i < place_list_.size(); ++i) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      Scope& local_scope = local_scopes_.back()->NewScope();
+      *local_scopes_.back()
+           ->Var(details::kLocalExecScopeName)
+           ->GetMutable<Scope*>() = &local_scope;
+      for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
+        local_scope.Var("out_var" + j);
+        if (i == j) local_scope.Var("in_var" + j);
+      }
+      param_scopes_.emplace_back(&local_scope);
+    }
+
+    // create op handle node
+    std::unique_ptr<ir::Node> n =
+        ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation);
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(new FusedBroadcastOpHandle(
+          n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
+#else
+      PADDLE_THROW("CUDA is not supported.");
+#endif
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(new FusedBroadcastOpHandle(
+          n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
+#else
+      op_handle_.reset(
+          new FusedBroadcastOpHandle(n.get(), local_scopes_, place_list_));
+#endif
+    }
+
+    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
+      // add input var handle
+      std::unique_ptr<ir::Node> in_node =
+          ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable);
+      VarHandle* in_var_handle =
+          new VarHandle(in_node.get(), 1, input_scope_idxes[i], "in_var" + i,
+                        place_list_[input_scope_idxes[i]]);
+      vars_.emplace_back(in_var_handle);
+      op_handle_->AddInput(in_var_handle);
+
+      // add output var handle
+      for (size_t j = 0; j < place_list_.size(); ++j) {
+        std::unique_ptr<ir::Node> out_node =
+            ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable);
+        VarHandle* out_var_handle =
+            new VarHandle(out_node.get(), 2, j, "out_var" + i, place_list_[j]);
+        vars_.emplace_back(out_var_handle);
+        op_handle_->AddOutput(out_var_handle);
+      }
+    }
+  }
+
+  void TestFusedBroadcastLoDTensor(std::vector<size_t> input_scope_idxes) {
+    std::vector<std::vector<float>> send_vec;
+    f::LoD lod{{0, 10, 20}};
+    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
+      const std::string varname("in_var" + i);
+      float val_scalar = static_cast<float>(i);
+      send_vec.push_back(
+          InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
+    }
+
+    op_handle_->Run(false);
+
+    WaitAll();
+    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
+      const std::string& varname("out_var" + i);
+      for (size_t j = 0; j < place_list_.size(); ++j) {
+        LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
+      }
+    }
+  }
+
+  void TestFusedBroadcastSelectedRows(std::vector<size_t> input_scope_idxes) {
+    std::vector<std::vector<float>> send_vector;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    int height = static_cast<int>(kDims[0] * 2);
+    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
+      const std::string varname("in_var" + i);
+      float val_scalar = static_cast<float>(i);
+      send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i],
+                                             rows, height, val_scalar));
+    }
+
+    op_handle_->Run(false);
+
+    WaitAll();
+    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
+      const std::string& varname("out_var" + i);
+      for (size_t j = 0; j < place_list_.size(); ++j) {
+        SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows,
+                          height);
+      }
+    }
+  }
+};
+
+TEST(FusedBroadcastTester, CPULodTensor) {
+  TestFusedBroadcastOpHandle test_op;
+  std::vector<size_t> input_scope_idxes = {0, 1};
+  test_op.InitCtxOnGpu(false);
+  test_op.InitFusedBroadcastOp(input_scope_idxes);
+  test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
+}
+
+TEST(FusedBroadcastTester, CPUSelectedRows) {
+  TestFusedBroadcastOpHandle test_op;
+  std::vector<size_t> input_scope_idxes = {0, 1};
+  test_op.InitCtxOnGpu(false);
+  test_op.InitFusedBroadcastOp(input_scope_idxes);
+  test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(FusedBroadcastTester, GPULodTensor) {
+  TestFusedBroadcastOpHandle test_op;
+  std::vector<size_t> input_scope_idxes = {0, 1};
+  test_op.InitCtxOnGpu(true);
+  test_op.InitFusedBroadcastOp(input_scope_idxes);
+  test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
+}
+
+TEST(FusedBroadcastTester, GPUSelectedRows) {
+  TestFusedBroadcastOpHandle test_op;
+  std::vector<size_t> input_scope_idxes = {0, 1};
+  test_op.InitCtxOnGpu(true);
+  test_op.InitFusedBroadcastOp(input_scope_idxes);
+  test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
+}
+#endif
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
index 9aae19fc73..ca4633c5a8 100644
--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() {
 
   VarHandle *out_var_handle;
   {
-    auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
     PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
                       "The number of output should be one.");
     out_var_handle = out_var_handles.front();
@@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() {
   Tensor *out_tensor = out_value->mutable_value();
 
   // copy
-  auto dev_ctx = dev_ctxes_[out_var_handle->place_];
+  auto dev_ctx = dev_ctxes_.at(out_var_handle->place_);
   RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx,
                                              t_out_p] {
     int s = 0, e = 0;
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 134fcee826..f3819887a1 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -21,6 +21,7 @@
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/data_balance_op_handle.h"
+#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
@@ -252,9 +253,9 @@ std::vector<ir::Node *> SortOpsAndDelayOptimizeOp(const ir::Graph &graph) {
   std::vector<ir::Node *> sorted_ret;
   for (size_t i = 0; i < ret.size(); ++i) {
     if (i < last_backward) {
-      if (boost::get<int>(ret[i]->Op()->GetAttr(
-              OpProtoAndCheckerMaker::OpRoleAttrName())) ==
-          static_cast<int>(OpRole::kOptimize)) {
+      if (static_cast<bool>(boost::get<int>(ret[i]->Op()->GetAttr(
+                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                            static_cast<int>(OpRole::kOptimize))) {
         optimize_ops.push_back(ret[i]);
       } else {
         sorted_ret.push_back(ret[i]);
@@ -347,7 +348,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
           BuildStrategy::GradientScaleStrategy::kCustomized) {
         // TODO(paddle-dev): Why is there no input for this op_handle?
         auto loss_grad_name = node->Op()->OutputArgumentNames()[0];
-        CreateScaleLossGradOp(&result, loss_grad_name);
+        CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]);
       }
       // This assumes the backward generating code will ensure IsScaleLossOp
       // is true only for the op that scale the final scalar loss.
@@ -436,10 +437,14 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   if ((use_gpu &&
        strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
       is_dist_train) {
-    for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
-      auto &to_bcast_set = bcast_var_name_set[dev_id];
-      for (auto &bcast_name : to_bcast_set) {
-        CreateBroadcastOp(&result, bcast_name, dev_id);
+    if (strategy_.fuse_broadcast_op_) {
+      CreateFusedBroadcastOp(&result, bcast_var_name_set);
+    } else {
+      for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
+        auto &to_bcast_set = bcast_var_name_set[dev_id];
+        for (auto &bcast_name : to_bcast_set) {
+          CreateBroadcastOp(&result, bcast_name, dev_id);
+        }
       }
     }
   }
@@ -508,6 +513,44 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result,
   }
 }
 
+void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp(
+    ir::Graph *result,
+    const std::vector<std::unordered_set<std::string>> &bcast_varnames) const {
+#ifdef PADDLE_WITH_CUDA
+  auto *op_handle = new FusedBroadcastOpHandle(
+      result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
+      local_scopes_, places_, nccl_ctxs_);
+#else
+  auto *op_handle = new FusedBroadcastOpHandle(
+      result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
+      local_scopes_, places_);
+#endif
+  result->Get<GraphOps>(kGraphOps).emplace_back(op_handle);
+
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto &p = places_[i];
+    SetCommunicationContext(op_handle, p);
+  }
+
+  for (size_t dev_id = 0; dev_id < bcast_varnames.size(); ++dev_id) {
+    for (auto &p_name : bcast_varnames[dev_id]) {
+      auto *in =
+          result->Get<GraphVars>(kGraphVars).at(dev_id).at(p_name).back().get();
+      op_handle->AddInput(in);
+      for (size_t out_dev_id = 0; out_dev_id < places_.size(); ++out_dev_id) {
+        auto &p = places_[out_dev_id];
+        auto &vars =
+            result->Get<GraphVars>(kGraphVars).at(out_dev_id).at(p_name);
+        auto *out_var = new VarHandle(
+            result->CreateEmptyNode(p_name, ir::Node::Type::kVariable),
+            vars.size(), out_dev_id, p_name, p);
+        vars.emplace_back(out_var);
+        op_handle->AddOutput(out_var);
+      }
+    }
+  }
+}
+
 void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
                                                     ir::Node *node,
                                                     int dev_id) const {
@@ -602,7 +645,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph,
 }
 
 void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
-    ir::Graph *result, const std::string &loss_grad_name) const {
+    ir::Graph *result, const std::string &loss_grad_name,
+    ir::Node *out_var_node) const {
   for (size_t i = 0; i < places_.size(); ++i) {
     // Insert ScaleCost OpHandle
     auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]);
@@ -617,10 +661,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(
     // loss->pending_ops_.emplace_back(op_handle);
     // op_handle->inputs_.emplace_back(loss);
 
-    CreateOpOutput(
-        result, op_handle,
-        result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable),
-        places_[i], i);
+    CreateOpOutput(result, op_handle,
+                   result->CreateVarNode(out_var_node->Var()), places_[i], i);
   }
 }
 
@@ -680,7 +722,8 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
   }
 
   if (node->Op()->Type() == "split_byref" ||
-      node->Op()->Type() == "split_selected_rows") {
+      node->Op()->Type() == "split_selected_rows" ||
+      node->Op()->Type() == "split_ids") {
     // TODO(paddle-dev): getting the first var is not safe.
     op_dev_id = GetVarDeviceID(*result, input_var_names[0]);
     if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index cdf9f13cde..03b2de2f04 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -61,7 +61,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
                               size_t num_places) const;
 
   void CreateScaleLossGradOp(ir::Graph *result,
-                             const std::string &loss_grad_name) const;
+                             const std::string &loss_grad_name,
+                             ir::Node *out_var_node) const;
 
   VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og,
                             int dst_dev_id) const;
@@ -78,6 +79,10 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
   void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
                          size_t src_dev_id) const;
 
+  void CreateFusedBroadcastOp(
+      ir::Graph *result,
+      const std::vector<std::unordered_set<std::string>> &bcast_varnames) const;
+
   bool IsSparseGradient(const std::string &og) const;
 
   size_t GetAppropriateDeviceID(
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 3812f0abf1..4822627ac3 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
 void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
   for (auto *in : inputs_) {
     if (NeedWait(in)) {
-      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]);
+      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place));
     }
   }
 }
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 7fc06f234d..4503123eac 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -27,7 +27,7 @@ namespace framework {
 namespace details {
 
 void ReduceOpHandle::RunImpl() {
-  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
 
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index a6289b055f..999828ae45 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase {
         nccl_ctxs_(nccl_ctxs) {
     if (nccl_ctxs_) {
       for (auto &p_ctx : nccl_ctxs_->contexts_) {
-        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+        this->SetDeviceContext(platform::CUDAPlace(p_ctx.first),
+                               p_ctx.second.ctx_.get());
       }
     }
   }
diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc
index f44b374edb..65df7f2d51 100644
--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() {
       continue;
     }
     if (in->GeneratedOp()) {
-      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]);
+      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p));
     }
   }
   auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index ba243979b3..ef16265997 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
       coeff_(static_cast<float>(1.0 / num_dev)),
       scope_(scope),
       place_(place) {
-  dev_ctxes_[place_] = dev_ctx;
+  this->SetDeviceContext(place_, dev_ctx);
 }
 
 ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
@@ -46,9 +46,9 @@ void ScaleLossGradOpHandle::RunImpl() {
   } else {
 #ifdef PADDLE_WITH_CUDA
     this->RunAndRecordEvent([&] {
-      auto stream =
-          static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
-              ->stream();
+      auto stream = static_cast<platform::CUDADeviceContext *>(
+                        this->dev_ctxes_.at(place_))
+                        ->stream();
       memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                    platform::CPUPlace(), &coeff_, sizeof(float), stream);
       VLOG(10) << place_ << "RUN Scale loss grad op";
diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc
new file mode 100644
index 0000000000..cc2c8bfef9
--- /dev/null
+++ b/paddle/fluid/framework/details/sequential_execution_pass.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/sequential_execution_pass.h"
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) {
+  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
+         op1->Outputs() == op2->Outputs();
+}
+
+std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  // FIXME(zjl): Insert dependencies between some distributed ops may cause
+  // the multi_devices_graph_pass fails. So we skip these ops here.
+  // Indeed, maybe we should not insert dependencies between these ops
+  // casually, which may cause deadlock easily.
+  // We should add more skipped distributed ops when found errors in
+  // multi_devices_graph_pass
+  static std::unordered_set<std::string> skip_dist_ops{
+      "send", "recv", "send_barrier", "fetch_barrier"};
+
+  auto &ops = Get<const std::vector<OpDesc *>>(kAllOpDescs);
+  std::vector<ir::Node *> op_node_list;
+  op_node_list.reserve(ops.size());
+
+  std::unordered_map<ir::Node *, size_t> op_deps;
+  std::unordered_map<ir::Node *, std::unordered_set<ir::Node *>> pending_ops;
+  std::unordered_set<ir::Node *> ready_ops;
+
+  for (ir::Node *node : graph->Nodes()) {
+    if (!node->IsOp()) continue;
+    std::unordered_set<ir::Node *> preceding_ops;
+    for (auto *in : node->inputs) {
+      PADDLE_ENFORCE(in->IsVar(),
+                     "Preceding Node of Op Nodes must be Var Node");
+      if (in->inputs.empty()) continue;
+      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp(),
+                     "Preceding Op Node of Var Node must be unique");
+      preceding_ops.insert(in->inputs[0]);
+      pending_ops[in->inputs[0]].insert(node);
+    }
+    op_deps[node] = preceding_ops.size();
+    if (preceding_ops.empty()) {
+      ready_ops.insert(node);
+    }
+  }
+
+  for (auto *op_desc : ops) {
+    ir::Node *found_node = nullptr;
+    for (auto *node : ready_ops) {
+      if (IsSameOpDesc(op_desc, node->Op())) {
+        PADDLE_ENFORCE(found_node == nullptr,
+                       "Found multiple op_desc in graph: %s", op_desc->Type());
+        found_node = node;
+      }
+    }
+
+    PADDLE_ENFORCE_NOT_NULL(found_node, "Cannot find op_desc in graph: %s",
+                            op_desc->Type());
+    for (auto *pending_op : pending_ops[found_node]) {
+      if (--op_deps.at(pending_op) == 0) {
+        ready_ops.insert(pending_op);
+      }
+    }
+    ready_ops.erase(found_node);
+    if (skip_dist_ops.count(op_desc->Type()) == 0) {
+      op_node_list.push_back(found_node);
+    }
+  }
+
+  for (size_t i = 1; i < op_node_list.size(); ++i) {
+    auto *dep_var = graph->CreateControlDepVar();
+    op_node_list[i]->inputs.push_back(dep_var);
+    op_node_list[i - 1]->outputs.push_back(dep_var);
+    dep_var->outputs.push_back(op_node_list[i]);
+    dep_var->inputs.push_back(op_node_list[i - 1]);
+    VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
+             << " and " << op_node_list[i]->Name();
+  }
+  return graph;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(sequential_execution_pass,
+              paddle::framework::details::SequentialExecutionPass)
+    .RequirePassAttr(paddle::framework::details::kAllOpDescs);
diff --git a/paddle/fluid/framework/details/sequential_execution_pass.h b/paddle/fluid/framework/details/sequential_execution_pass.h
new file mode 100644
index 0000000000..a04c08bc2e
--- /dev/null
+++ b/paddle/fluid/framework/details/sequential_execution_pass.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+constexpr char kAllOpDescs[] = "all_op_descs";
+
+class SequentialExecutionPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 31beef3ae8..dc63effd1b 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -39,7 +39,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
       new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
-  BlockingQueue<VarHandleBase *> ready_vars;
+  auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
   std::unordered_set<OpHandleBase *> ready_ops;
   // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
   // streams from multiple GPUs, it's faster to buffer them and schedule
@@ -51,12 +51,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
-        InsertPendingVar(&pending_vars, &ready_vars, version_pair.get());
+        InsertPendingVar(&pending_vars, ready_vars.get(), version_pair.get());
       }
     }
   }
   for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
-    InsertPendingVar(&pending_vars, &ready_vars, var.get());
+    InsertPendingVar(&pending_vars, ready_vars.get(), var.get());
   }
 
   for (auto &op : graph_->Get<details::GraphOps>(details::kGraphOps)) {
@@ -73,12 +73,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   FeedFetchList fetch_data(fetch_tensors.size());
 
   InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
-                 &pending_vars, &ready_vars, &fetch_data);
+                 &pending_vars, ready_vars.get(), &fetch_data);
 
   auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
     for (auto *op : set) {
       running_ops_++;
-      RunOp(&ready_vars, op);
+      RunOp(ready_vars, op);
     }
     set.clear();
   };
@@ -87,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   run_op_futures_.clear();
   exception_holder_.Clear();
   event.reset(nullptr);
-
   // Step 3. Execution
   while (!pending_vars.empty()) {
     // 1. Run All Ready ops
@@ -103,7 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 
     // 2. Find ready variable
     bool timeout;
-    auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
+    auto cur_ready_vars = ready_vars->PopAll(1, &timeout);
 
     if (timeout) {
       if (exception_holder_.IsCaught()) {
@@ -133,7 +132,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     }
   }
   PADDLE_ENFORCE(ready_ops.empty());
-
   // Wait FetchOps.
   ClearFetchOp(graph_.get(), &fetch_ops);
 
@@ -206,7 +204,8 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
 }
 
 void ThreadedSSAGraphExecutor::RunOp(
-    BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
+    const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
+    details::OpHandleBase *op) {
   auto op_run = [ready_var_q, op, this] {
     try {
       if (VLOG_IS_ON(10)) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 512f8a4ca5..dbb0b498d9 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -51,7 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ~ThreadedSSAGraphExecutor() {}
 
  private:
-  void RunOp(BlockingQueue<VarHandleBase *> *ready_var_q,
+  void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
              details::OpHandleBase *op);
 
  private:
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index 25f0ba4184..efdabffb9b 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -35,6 +35,7 @@ enum AttrType {
   BLOCK = 8;
   LONG = 9;
   BLOCKS = 10;
+  LONGS = 11;
 }
 
 // OpDesc describes an instance of a C++ framework::OperatorBase
@@ -55,6 +56,7 @@ message OpDesc {
     optional int32 block_idx = 12;
     optional int64 l = 13;
     repeated int32 blocks_idx = 14;
+    repeated int64 longs = 15;
   };
 
   message Var {
@@ -80,7 +82,6 @@ message OpProto {
     optional bool duplicable = 3 [ default = false ];
     optional bool intermediate = 4 [ default = false ];
     optional bool dispensable = 5 [ default = false ];
-    optional string reuse = 6;
   }
 
   // AttrProto describes the C++ type Attribute.
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 3aa2c7b9ea..28231a53ba 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -36,18 +36,18 @@ pass_library(fc_lstm_fuse_pass inference)
 pass_library(embedding_fc_lstm_fuse_pass inference)
 pass_library(fc_gru_fuse_pass inference)
 pass_library(seq_concat_fc_fuse_pass inference)
+pass_library(multi_batch_merge_pass base)
 pass_library(conv_bn_fuse_pass inference)
 pass_library(seqconv_eltadd_relu_fuse_pass inference)
 if(WITH_MKLDNN)
     pass_library(mkldnn_placement_pass base)
+    pass_library(depthwise_conv_mkldnn_pass base)
     pass_library(conv_bias_mkldnn_fuse_pass inference)
     pass_library(conv_relu_mkldnn_fuse_pass inference)
+    pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
 endif()
 
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
-if(WITH_MKLDNN)
-  pass_library(conv_elementwise_add_mkldnn_fuse_pass inference)
-endif()
 
 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
 
@@ -60,6 +60,7 @@ cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 if (WITH_MKLDNN)
+    cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
     cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
     cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
 endif ()
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
index b5de0d5487..fe585bd7c4 100644
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
@@ -31,7 +31,8 @@ class ConvReLUFusePass : public FusePassBase {
   virtual ~ConvReLUFusePass() {}
 
  protected:
-  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
index 8f4bab25ed..19248b4dfe 100644
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
 
 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
 namespace framework {
@@ -36,6 +37,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetInput("X", inputs);
   }
   op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
 }
 
 // a->OP0->b
diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
new file mode 100644
index 0000000000..19056e18aa
--- /dev/null
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_NODE(id, pattern)                               \
+  PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \
+                 "pattern has no Node called %s", #id);     \
+  auto* id = subgraph.at(pattern.RetrieveNode(#id));        \
+  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
+
+std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init("depthwise_conv_mkldnn_pass", graph.get());
+  GraphPatternDetector gpd;
+
+  auto* pattern = gpd.mutable_pattern();
+  pattern->NewNode("depthwise_conv")
+      ->assert_is_op("depthwise_conv2d")
+      ->assert_op_attr("use_mkldnn", true);
+
+  int found_depthwise_conv_mkldnn_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
+    GET_NODE(depthwise_conv, (*pattern));
+    depthwise_conv->Op()->SetType("conv2d");
+    found_depthwise_conv_mkldnn_count++;
+  };
+
+  gpd(graph.get(), handler);
+  AddStatis(found_depthwise_conv_mkldnn_count);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(depthwise_conv_mkldnn_pass,
+              paddle::framework::ir::DepthwiseConvMKLDNNPass);
diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h
new file mode 100644
index 0000000000..8ca6a73251
--- /dev/null
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class DepthwiseConvMKLDNNPass : public FusePassBase {
+ public:
+  virtual ~DepthwiseConvMKLDNNPass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
new file mode 100644
index 0000000000..09d0b15f46
--- /dev/null
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
@@ -0,0 +1,123 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn = false) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("use_mkldnn", use_mkldnn);
+  op->SetAttr("name", name);
+  op->SetInput("Input", {inputs[0]});
+  op->SetInput("Filter", {inputs[1]});
+  op->SetInput("Bias", {inputs[2]});
+  op->SetOutput("Out", outputs);
+}
+
+// (a, weights, bias)->depthwise conv mkldnn->b
+// (b, weights2, bias2)->depthwise conv no mkldnn->c
+// (c, weights3, bias3)->conv mkldnn->d
+// (d, weights3, bias3)->conv no mkldnn->e
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  for (auto& v : std::vector<std::string>(
+           {"a", "b", "c", "d", "e", "weights", "bias", "weights2", "bias2",
+            "weights3", "bias3", "weights4", "bias4"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (v == "weights" || v == "bias" || v == "weights2" || v == "bias2" ||
+        v == "weights3" || v == "bias3" || v == "weights4" || v == "bias4") {
+      var->SetPersistable(true);
+    }
+  }
+
+  // depthwise conv with MKL-DNN
+  SetOp(&prog, "depthwise_conv2d", "conv1",
+        std::vector<std::string>({"a", "weights", "bias"}),
+        std::vector<std::string>({"b"}), true);
+  // depthwise conv without MKL-DNN
+  SetOp(&prog, "depthwise_conv2d", "conv2",
+        std::vector<std::string>({"b", "weights2", "bias2"}),
+        std::vector<std::string>({"c"}), false);
+  // conv with MKL-DNN
+  SetOp(&prog, "conv2d", "conv3",
+        std::vector<std::string>({"c", "weights3", "bias3"}),
+        std::vector<std::string>({"d"}), true);
+  // conv without MKL-dNN
+  SetOp(&prog, "conv2d", "conv4",
+        std::vector<std::string>({"d", "weights4", "bias4"}),
+        std::vector<std::string>({"e"}), false);
+
+  return prog;
+}
+
+TEST(DepthwiseConvMKLDNNPass, basic) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass");
+
+  struct counters {
+    int mkldnn_depthwise_conv_nodes;
+    int other_depthwise_conv_nodes;
+    int mkldnn_conv_nodes;
+    int other_conv_nodes;
+  };
+
+  counters before{1, 1, 1, 1};
+
+  graph = pass->Apply(std::move(graph));
+
+  // initialize counters before loop
+  counters after{0, 0, 0, 0};
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->Type() == "conv2d") {
+        if (boost::get<bool>(op->GetAttr("use_mkldnn")))
+          after.mkldnn_conv_nodes++;
+        else
+          after.other_conv_nodes++;
+      } else if (op->Type() == "depthwise_conv2d") {
+        if (boost::get<bool>(op->GetAttr("use_mkldnn")))
+          after.mkldnn_depthwise_conv_nodes++;
+        else
+          after.other_depthwise_conv_nodes++;
+      }
+    }
+  }
+
+  EXPECT_EQ(after.other_depthwise_conv_nodes,
+            before.other_depthwise_conv_nodes);
+  EXPECT_EQ(after.other_conv_nodes, before.other_conv_nodes);
+  EXPECT_EQ(after.mkldnn_depthwise_conv_nodes,
+            before.mkldnn_depthwise_conv_nodes - 1);
+  EXPECT_EQ(after.mkldnn_conv_nodes, before.mkldnn_conv_nodes + 1);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(depthwise_conv_mkldnn_pass);
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
index 06286a109d..2db7d95cae 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
 
 #include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
 namespace framework {
@@ -32,6 +33,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
     op->SetInput("X", inputs);
   }
   op->SetOutput("Out", outputs);
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
 }
 
 // a->OP0->b
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 398f709596..813f620d7c 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -23,80 +23,78 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
-
-std::vector<std::string> FindDistTrainSendVars(
-    const std::vector<ir::Node *> &nodes) {
-  std::vector<std::string> send_vars;
-  // since parameters are all in block 0,
-  // it's enough to only scan send ops in block 0
-  for (auto &node : nodes) {
-    auto op_vars = node->Op()->InputArgumentNames();
-    send_vars.reserve(send_vars.size() +
-                      std::distance(op_vars.begin(), op_vars.end()));
-    send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end());
-  }
-  return send_vars;
-}
-
-std::vector<std::string> FindDistTrainRecvVars(
-    const std::vector<ir::Node *> &nodes) {
-  std::vector<std::string> recv_vars;
-  for (auto &node : nodes) {
-    auto op_vars = node->Op()->OutputArgumentNames();
-    recv_vars.reserve(recv_vars.size() +
-                      std::distance(op_vars.begin(), op_vars.end()));
-    recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end());
-  }
-  return recv_vars;
-}
-
-bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
-                   const std::vector<std::string> &recv_vars) {
-  if (send_vars.size() == 0 || recv_vars.size() == 0) {
-    return false;
-  }
-
-  /**
-   * Check any of opvars contains `.block` and in sendvars
-   */
-  auto checker = [](const std::vector<std::string> &opvars,
-                    const std::vector<std::string> &rpc_vars) -> bool {
-    for (auto &var : opvars) {
-      // a variable name with the suffix `.block` means it's a splited
-      // variable by (DistributeTranspiler)
-      // [python/paddle/fluid/transpiler/distribute_transpiler.py]
-      if (var.find(".block") != std::string::npos &&
-          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
-        return true;
+namespace {
+
+void CheckProgram(const ProgramDesc &program) {
+  std::map<int, bool> visit;
+#define _INT(role) static_cast<int>(role)
+
+  for (size_t i = 0; i < program.Size(); ++i) {
+    for (OpDesc *op : program.Block(i).AllOps()) {
+      // For backward compatibility, some program doesn't have role added.
+      if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
+      int role_id = boost::get<int>(
+          op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
+      visit[role_id] = true;
+      switch (role_id) {
+        case _INT(OpRole::kForward):
+          PADDLE_ENFORCE(
+              visit.find(_INT(OpRole::kBackward)) == visit.end(),
+              "Cannot add forward operator before backward operator.");
+          break;
+        case _INT(OpRole::kBackward):
+        case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
+          PADDLE_ENFORCE(
+              visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+              "Cannot add backward operator before optimize operator.");
+          break;
+        case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
+          PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
+                                    _INT(OpRole::kLoss)) == visit.end(),
+                         "Cannot add backward|loss operator before "
+                         "forward|loss operator.");
+          PADDLE_ENFORCE(
+              visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+              "Cannot add backward operator before optimize operator.");
+          break;
+        case _INT(OpRole::kOptimize):
+        case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
+          PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
+                         "Optimize operators must follow backward operator.");
+          break;
+        case _INT(OpRole::kLRSched):
+        case _INT(OpRole::kDist):
+        case _INT(OpRole::kRPC):
+        case _INT(OpRole::kNotSpecified):
+          break;
+        default:
+          LOG(FATAL) << "Unknown operator role. Don't add new role because "
+                        "you don't know what you are doing.";
       }
     }
-    return false;
-  };
-
-  std::vector<std::string> input_var_names;
-  std::vector<std::string> output_var_names;
-  for (ir::Node *input : node->inputs) {
-    input_var_names.push_back(input->Name());
   }
-  for (ir::Node *output : node->outputs) {
-    output_var_names.push_back(output->Name());
-  }
-
-  return checker(output_var_names, send_vars) ||
-         checker(input_var_names, recv_vars);
+#undef _INT
 }
+}  // namespace
 
 Graph::Graph(const ProgramDesc &program) : program_(program) {
+  CheckProgram(program_);
   // Make the nodes id start from 0.
   Node::ResetId();
+  auto var_nodes = InitFromProgram(program_);
+  ResolveHazard(var_nodes);
+}
 
+std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
+    const ProgramDesc &program) {
   VLOG(3) << "block in program:" << program_.Size();
   std::unordered_map<std::string, VarDesc *> all_vars;
+  // var nodes for each var name, will have multiple versions in SSA
+  std::map<std::string, std::vector<ir::Node *>> var_nodes;
   for (auto *var : program.Block(0).AllVars()) {
     all_vars.emplace(var->Name(), var);
   }
 
-  std::map<std::string, std::vector<ir::Node *>> var_nodes;
   for (auto *op : program.Block(0).AllOps()) {
     ir::Node *node = CreateOpNode(op);
     // For input args, reuse the same var name if it was created before.
@@ -134,7 +132,11 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
       var->inputs.push_back(node);
     }
   }
+  return std::move(var_nodes);
+}
 
+void Graph::ResolveHazard(
+    const std::map<std::string, std::vector<ir::Node *>> &var_nodes) {
   /**
    * We should handle write after read(WAR) and write after write(WAW) here.
    * Because some of the operators of the program can be executed parallelly.
@@ -153,6 +155,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
     auto it_old = versions.rbegin();
     ++it_old;
     for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
+      VLOG(3) << "deal with var: " << (*it_new)->Name();
       ir::Node *write_op =
           (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
       const auto &read_ops = (*it_old)->outputs;
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index ab687e760a..9d7aa5d32d 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -160,6 +160,12 @@ class Graph {
     return nullptr;
   }
 
+  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
+      const ProgramDesc &program);
+
+  void ResolveHazard(
+      const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
+
  private:
   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index c54766d95a..01e8780891 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -120,19 +120,25 @@ size_t GraphNum(const Graph &graph) {
   std::deque<ir::Node *> q_nodes;
   std::vector<std::unordered_set<ir::Node *>> graph_nodes;
   std::unordered_set<ir::Node *> g_nodes;
+  // q_set used to record records in the queue.
+  std::unordered_set<ir::Node *> q_set;
   size_t graph_count = 0;
 
-  auto traverse_nodes = [&visited_nodes,
-                         &q_nodes](const std::vector<ir::Node *> &nodes) {
-    std::copy_if(
-        nodes.begin(), nodes.end(), std::back_inserter(q_nodes),
-        [&visited_nodes](Node *node) { return !visited_nodes.count(node); });
+  auto traverse_nodes = [&visited_nodes, &q_nodes,
+                         &q_set](const std::vector<ir::Node *> &nodes) {
+    for (auto n : nodes) {
+      if (visited_nodes.count(n) == 0 && q_set.count(n) == 0) {
+        q_nodes.push_back(n);
+        q_set.insert(n);
+      }
+    }
   };
 
   while (visited_nodes.size() != nodes.size()) {
     if (!q_nodes.empty()) {
       auto cur_node = q_nodes.front();
       q_nodes.pop_front();
+      q_set.erase(cur_node);
       visited_nodes.insert(cur_node);
       g_nodes.insert(cur_node);
       traverse_nodes(cur_node->inputs);
@@ -146,6 +152,7 @@ size_t GraphNum(const Graph &graph) {
       for (auto &n : nodes) {
         if (visited_nodes.count(n) == 0) {
           q_nodes.push_back(n);
+          q_set.insert(n);
           break;
         }
       }
diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc
index cea9028093..260a73ae76 100644
--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -200,15 +200,15 @@ TEST(GraphHelperTest, GraphNum) {
 
   Graph g(prog);
   BuildZeroGraph(&g);
-  ASSERT_EQ(GraphNum(g), 0);
+  ASSERT_EQ(GraphNum(g), 0UL);
 
   Graph g2(prog);
   BuildOneGraph(&g2);
-  ASSERT_EQ(GraphNum(g2), 1);
+  ASSERT_EQ(GraphNum(g2), 1UL);
 
   Graph g3(prog);
   BuildTwoGraphs(&g3);
-  ASSERT_EQ(GraphNum(g3), 2);
+  ASSERT_EQ(GraphNum(g3), 2UL);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 29b604afbf..b20d701322 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() {
   return result;
 }
 
+bool GraphItemCMP(const std::pair<PDNode *, Node *> &a,
+                  const std::pair<PDNode *, Node *> &b) {
+  if (a.first != b.first) {
+    return a.first < b.first;
+  } else {
+    return a.second < b.second;
+  }
+}
+
 // TODO(Superjomn) enhance the function as it marks unique unique as duplicates
 // see https://github.com/PaddlePaddle/Paddle/issues/13550
 void GraphPatternDetector::UniquePatterns(
@@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns(
   std::vector<GraphPatternDetector::subgraph_t> result;
 
   std::unordered_set<size_t> set;
+  std::hash<std::string> hasher;
   for (auto &g : *subgraphs) {
-    size_t key = 0;
-    for (auto &item : g) {
-      key ^= std::hash<void *>{}(item.first);
-      key ^= std::hash<void *>{}(item.second);
+    // Sort the items in the sub-graph, and transform to a string key.
+    std::vector<std::pair<PDNode *, Node *>> sorted_keys(g.begin(), g.end());
+    std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP);
+    std::stringstream ss;
+    for (auto &item : sorted_keys) {
+      ss << item.first << ":" << item.second;
     }
+    auto key = hasher(ss.str());
     if (!set.count(key)) {
       result.emplace_back(g);
       set.insert(key);
diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc
index cadda49c39..7ed2f96eb2 100644
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -124,7 +124,7 @@ TEST(GraphTest, Basic) {
       ASSERT_EQ(n->outputs.size(), 0UL);
     }
   }
-  ASSERT_EQ(nodes.size(), 5);
+  ASSERT_EQ(nodes.size(), 5UL);
 }
 
 TEST(GraphTest, WriteAfterRead) {
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
new file mode 100644
index 0000000000..bd5b76426e
--- /dev/null
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
@@ -0,0 +1,315 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/multi_batch_merge_pass.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+static const char kNumRepeats[] = "num_repeats";
+typedef std::unordered_map<std::string, std::vector<ir::Node*>> SSAVarList;
+
+ir::Node* SameNameVar(std::unordered_set<ir::Node*> all, ir::Node* target) {
+  for (auto n : all) {
+    if (target->IsVar() && target->Name() == n->Name()) {
+      return n;
+    }
+  }
+  return nullptr;
+}
+
+VarDesc CopyVarDesc(VarDesc* var_desc) {
+  VarDesc repeated_var(var_desc->Name());
+  // copy other variable attributes
+  if (var_desc->GetType() != proto::VarType::READER) {
+    repeated_var.SetType(var_desc->GetType());
+    repeated_var.SetShape(var_desc->GetShape());
+    repeated_var.SetDataType(var_desc->GetDataType());
+    repeated_var.SetLoDLevel(var_desc->GetLoDLevel());
+    repeated_var.SetPersistable(var_desc->Persistable());
+  } else {
+    // TODO(typhoonzero): copy reader var
+  }
+  return repeated_var;
+}
+
+VarDesc UpdateGradVarDesc(
+    VarDesc* var_desc, int repeat,
+    const std::unordered_set<std::string>& grad_names,
+    const std::unordered_set<std::string>& bn_vars_need_rename) {
+  if (grad_names.find(var_desc->Name()) != grad_names.end() ||
+      bn_vars_need_rename.find(var_desc->Name()) != bn_vars_need_rename.end()) {
+    std::string new_gname =
+        string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat);
+    VarDesc repeated_var = CopyVarDesc(var_desc);
+    repeated_var.SetName(new_gname);
+    VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat;
+    return repeated_var;
+  }
+  return *var_desc;
+}
+
+std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
+    std::unique_ptr<Graph> graph) const {
+  int num_repeats = Get<const int>(kNumRepeats);
+  std::vector<Node*> forward_backward_ops;
+  std::vector<Node*> optimize_ops;
+  std::vector<Node*> lr_ops;  // ops other than forward/backward/optimize
+  std::unordered_set<std::string> grad_names;
+
+  std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
+  auto origin_nodes = graph->ReleaseNodes();
+  VLOG(3) << "origin nodes count: " << origin_nodes.size();
+  ir::Graph& result = *graph;
+
+  // 1. record op nodes of different roles
+  for (auto node : nodes) {
+    if (node->IsVar()) continue;
+    int op_role = boost::get<int>(node->Op()->GetAttr(
+        framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
+    if ((op_role == static_cast<int>(framework::OpRole::kForward)) ||
+        (op_role & static_cast<int>(framework::OpRole::kBackward)) ||
+        (op_role & static_cast<int>(framework::OpRole::kLoss))) {
+      forward_backward_ops.push_back(node);
+    } else if ((op_role & static_cast<int>(framework::OpRole::kOptimize)) ||
+               (op_role & static_cast<int>(framework::OpRole::kDist)) ||
+               (op_role & static_cast<int>(framework::OpRole::kRPC))) {
+      optimize_ops.push_back(node);
+      auto op_role_var = node->Op()->GetNullableAttr(
+          OpProtoAndCheckerMaker::OpRoleVarAttrName());
+      auto op_role_vars = boost::get<std::vector<std::string>>(op_role_var);
+      for (size_t i = 0; i < op_role_vars.size(); i += 2) {
+        grad_names.insert(op_role_vars[i + 1]);
+      }
+    } else if (op_role & static_cast<int>(framework::OpRole::kLRSched)) {
+      lr_ops.push_back(node);
+    } else {  // NOLINT
+      PADDLE_THROW("Invalid op_role: %d", static_cast<int>(op_role));
+    }
+  }
+
+  // 2. copy forward backward
+  ir::Node* prev_repeat_last_op_node = nullptr;
+  // record origin_grad -> repeated grad list map.
+  std::map<ir::Node*, std::vector<ir::Node*>> grad_repeated_map;
+  std::map<std::string, std::vector<ir::Node*>> created;
+  std::unordered_set<std::string> bn_vars_need_rename;
+  for (int i = 0; i < num_repeats; ++i) {
+    std::unordered_set<ir::Node*> copied;
+    for (size_t node_idx = 0; node_idx < forward_backward_ops.size();
+         ++node_idx) {
+      auto node = forward_backward_ops[node_idx];
+      OpDesc repeated_op(*(node->Op()), node->Op()->Block());
+      // 3. rename grad outputs to current repeat.
+      for (auto outname : repeated_op.OutputArgumentNames()) {
+        if (grad_names.find(outname) != grad_names.end()) {
+          std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i);
+          repeated_op.RenameOutput(outname, new_gname);
+        }
+      }
+      // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do
+      // not need this update
+      if (node->Name() == "batch_norm") {
+        // NOTE: assume bn op created by layers use save var as output mean and
+        // variance
+        std::string new_mean_name =
+            string::Sprintf("%s.repeat.%d", repeated_op.Input("Mean")[0], i);
+        std::string new_var_name = string::Sprintf(
+            "%s.repeat.%d", repeated_op.Input("Variance")[0], i);
+        bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]);
+        bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]);
+        VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to "
+                << new_mean_name;
+        repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name);
+        repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name);
+        repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0],
+                                 new_mean_name);
+        repeated_op.RenameOutput(repeated_op.Output("VarianceOut")[0],
+                                 new_var_name);
+      }
+
+      // 3.9 do copy
+      auto repeated_node = result.CreateOpNode(&repeated_op);
+      copied.insert(node);
+
+      // 4. add deps between repeats
+      if (node_idx == forward_backward_ops.size() - 1) {
+        prev_repeat_last_op_node = repeated_node;
+      }
+      if (node_idx == 0 && prev_repeat_last_op_node) {
+        auto* depvar = result.CreateControlDepVar();
+        prev_repeat_last_op_node->outputs.push_back(depvar);
+        depvar->inputs.push_back(prev_repeat_last_op_node);
+        repeated_node->inputs.push_back(depvar);
+        depvar->outputs.push_back(repeated_node);
+      }
+
+      for (auto in_node : node->inputs) {
+        if (in_node->IsCtrlVar()) {
+          continue;
+        }
+        ir::Node* var = nullptr;
+        auto updated_var = UpdateGradVarDesc(in_node->Var(), i, grad_names,
+                                             bn_vars_need_rename);
+        // should be initialized by startup, how to initilize tensor in the
+        // scope?
+        if (node->Name() == "batch_norm" &&
+            bn_vars_need_rename.find(in_node->Name()) !=
+                bn_vars_need_rename.end()) {
+          // Create bn mean/variance for each repeat
+          var = result.CreateVarNode(&updated_var);
+          created[updated_var.Name()].push_back(var);
+          copied.insert(in_node);
+          repeated_node->inputs.push_back(var);
+          var->outputs.push_back(repeated_node);
+          continue;
+        }
+
+        // for other ops
+        if (in_node->inputs.empty() && i > 0) {
+          // do not copy head vars (inputs, params) in repeats > 0
+          var = created.at(in_node->Name()).back();
+        } else {
+          if (copied.find(in_node) == copied.end()) {
+            var = result.CreateVarNode(&updated_var);
+            if (grad_names.find(in_node->Var()->Name()) != grad_names.end()) {
+              grad_repeated_map[in_node].push_back(var);
+            }
+            copied.insert(in_node);
+            created[updated_var.Name()].push_back(var);
+          } else {
+            var = created.at(updated_var.Name()).back();
+          }
+        }
+        repeated_node->inputs.push_back(var);
+        var->outputs.push_back(repeated_node);
+      }
+      for (auto out_node : node->outputs) {
+        if (out_node->IsCtrlVar()) {
+          continue;
+        }
+        ir::Node* var = nullptr;
+        auto updated_var = UpdateGradVarDesc(out_node->Var(), i, grad_names,
+                                             bn_vars_need_rename);
+        if (copied.find(out_node) == copied.end()) {
+          var = result.CreateVarNode(&updated_var);
+          if (grad_names.find(out_node->Var()->Name()) != grad_names.end()) {
+            grad_repeated_map[out_node].push_back(var);
+          }
+          copied.insert(out_node);
+          created[updated_var.Name()].push_back(var);
+        } else {
+          var = created.at(updated_var.Name()).back();
+        }
+        repeated_node->outputs.push_back(var);
+        var->inputs.push_back(repeated_node);
+      }
+    }
+  }
+
+  // 5. create GRAD merge op node
+  for (auto kv : grad_repeated_map) {
+    OpDesc sum_op;
+    sum_op.SetType("sum");
+    std::vector<std::string> repeated_grad_names;
+    for (auto r : kv.second) {
+      repeated_grad_names.push_back(r->Var()->Name());
+    }
+    sum_op.SetInput("X", repeated_grad_names);
+    sum_op.SetOutput("Out", {kv.first->Var()->Name()});
+    sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+                   static_cast<int>(OpRole::kBackward));
+    auto sum_op_node = result.CreateOpNode(&sum_op);
+    for (auto r : kv.second) {
+      sum_op_node->inputs.push_back(r);
+      r->outputs.push_back(sum_op_node);
+    }
+    auto sum_out_var_node = result.CreateVarNode(kv.first->Var());
+    sum_op_node->outputs.push_back(sum_out_var_node);
+    sum_out_var_node->inputs.push_back(sum_op_node);
+    created[sum_out_var_node->Name()].push_back(sum_out_var_node);
+
+    OpDesc scale_op;
+    scale_op.SetType("scale");
+    scale_op.SetInput("X", {sum_out_var_node->Var()->Name()});
+    // NOTE: inplace scale.
+    scale_op.SetOutput("Out", {sum_out_var_node->Var()->Name()});
+    scale_op.SetAttr("scale", static_cast<float>(1.0f / num_repeats));
+    scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+                     static_cast<int>(OpRole::kBackward));
+    auto scale_op_node = result.CreateOpNode(&scale_op);
+    scale_op_node->inputs.push_back(sum_out_var_node);
+    sum_out_var_node->outputs.push_back(scale_op_node);
+    auto scale_out_var_node = result.CreateVarNode(sum_out_var_node->Var());
+    scale_op_node->outputs.push_back(scale_out_var_node);
+    scale_out_var_node->inputs.push_back(scale_op_node);
+    created[scale_out_var_node->Name()].push_back(scale_out_var_node);
+  }
+  // 6. add optimize ops
+  {
+    auto copy_node = [&result, &created](ir::Node* node) {
+      auto op_node = result.CreateOpNode(node->Op());
+      // copy op ins/outs
+      // NOTE: for send/recv ops, the OpDesc uses ctrldepvar to describe
+      // dependencies, so create those depvars if OpDesc have in/outs.
+      for (auto in_node : node->inputs) {
+        if (in_node->IsCtrlVar() && !in_node->Var()) {
+          continue;
+        }
+        ir::Node* var = nullptr;
+        if (created.find(in_node->Name()) == created.end()) {
+          var = result.CreateVarNode(in_node->Var());
+          created[in_node->Name()].push_back(var);
+        } else {
+          var = created.at(in_node->Name()).back();
+        }
+        op_node->inputs.push_back(var);
+        var->outputs.push_back(op_node);
+      }
+      for (auto out_node : node->outputs) {
+        if (out_node->IsCtrlVar() && !out_node->Var()) {
+          continue;
+        }
+        auto var = result.CreateVarNode(out_node->Var());
+        created[out_node->Name()].push_back(var);
+        op_node->outputs.push_back(var);
+        var->inputs.push_back(op_node);
+      }
+    };
+    for (auto node : lr_ops) {
+      copy_node(node);
+    }
+    for (auto node : optimize_ops) {
+      copy_node(node);
+    }
+  }
+
+  result.ResolveHazard(created);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(multi_batch_merge_pass, paddle::framework::ir::BatchMergePass)
+    .RequirePassAttr(paddle::framework::ir::kNumRepeats);
diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.h b/paddle/fluid/framework/ir/multi_batch_merge_pass.h
new file mode 100644
index 0000000000..c1e5aef20d
--- /dev/null
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// BatchMergePass is used to copy forward and backward ops for several
+// times to run several batches to simulate large batch size training
+// as if we have more than 1 GPUs.
+// User can define how many batches to run, gradients will be merged
+// through those repeats, and then do optimization using merged gradients.
+// This pass is extremely useful when doing large batch-size distributed
+// sync training, we can simulate even large batch size as if we have more
+// GPUs.
+
+class BatchMergePass : public Pass {
+ public:
+  virtual ~BatchMergePass() {}
+
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 5d6da9f1d7..d6d42f5e92 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -44,6 +44,7 @@ class Node {
     return op_desc_.get();
   }
 
+  // Please don't use this API!
   int id() const { return id_; }
 
   bool IsOp() const { return type_ == Type::kOperation; }
@@ -92,6 +93,7 @@ class Node {
   Node() = delete;
 
   static int count_;
+  // Please don't use this API or make this public.
   static void ResetId() { count_ = 0; }
   DISABLE_COPY_AND_ASSIGN(Node);
 };
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 1e7da9a69c..669d08c70c 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor(
     PADDLE_ENFORCE_EQ(new_lod.size(), lod.size());
     for (size_t j = 0; j < lod.size(); ++j) {
       auto &sub_lod = new_lod[j];
-      auto &offset = sub_lod.back();
+      size_t offset = sub_lod.back();
       for (size_t k = 1; k < lod[j].size(); ++k) {
         sub_lod.push_back(lod[j][k] + offset);
       }
diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h
index 6d7b6a4ada..36a5c3c5d6 100644
--- a/paddle/fluid/framework/lod_tensor_array.h
+++ b/paddle/fluid/framework/lod_tensor_array.h
@@ -18,6 +18,8 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
 using LoDTensorArray = std::vector<LoDTensor>;
-}
+
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index 77386f4f06..e1aac6dc5a 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -542,6 +542,33 @@ class CPUVector : public std::vector<T, std::allocator<T>> {
     this->reserve(this->size() + size_t(end - begin));
     this->insert(this->end(), begin, end);
   }
+
+  const T *CUDAData(platform::Place place) const {
+    PADDLE_THROW(
+        "Vector::CUDAData() method is not supported in CPU-only version");
+  }
+
+  T *CUDAMutableData(platform::Place place) {
+    PADDLE_THROW(
+        "Vector::CUDAMutableData() method is not supported in CPU-only "
+        "version");
+  }
+
+  const T *Data(platform::Place place) const {
+    PADDLE_ENFORCE(
+        platform::is_cpu_place(place),
+        "Vector::Data() method is not supported when not in CPUPlace");
+    return this->data();
+  }
+
+  T *MutableData(platform::Place place) {
+    PADDLE_ENFORCE(
+        platform::is_cpu_place(place),
+        "Vector::MutableData() method is not supported when not in CPUPlace");
+    return this->data();
+  }
+
+  const void *Handle() const { return static_cast<const void *>(this); }
 };
 
 template <typename T>
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 2840d503f1..7fb42feb95 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -146,22 +146,5 @@ void NaiveExecutor::CleanFeedFetchOps() {
   ops_.swap(ops);
 }
 
-void NaiveExecutor::EnableMKLDNN(const ProgramDesc &program) {
-#ifdef PADDLE_WITH_MKLDNN
-  VLOG(3) << "use_mkldnn=True";
-  for (size_t block_id = 0; block_id < program.Size(); ++block_id) {
-    auto *block = const_cast<ProgramDesc &>(program).MutableBlock(block_id);
-    for (auto *op : block->AllOps()) {
-      if (op->HasAttr("use_mkldnn")) {
-        op->SetAttr("use_mkldnn", true);
-      }
-    }
-  }
-#else
-  LOG(WARNING)
-      << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
-#endif
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index 9374f3f4a3..ddfa6e1f4d 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -48,8 +48,6 @@ class NaiveExecutor {
 
   void CleanFeedFetchOps();
 
-  void EnableMKLDNN(const ProgramDesc& program);
-
  protected:
   void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);
 
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 121e00b1a3..8ece618f3f 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -419,8 +419,15 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
     }
     VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx());
   }
+
   void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
+
   void operator()(int64_t v) const { attr_->set_l(v); }
+
+  void operator()(const std::vector<int64_t> &v) const {
+    VectorToRepeated(v, attr_->mutable_longs());
+  }
+
   void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
 };
 
@@ -515,20 +522,14 @@ void OpDesc::InferShape(const BlockDesc &block) const {
 }
 
 void OpDesc::InferVarType(BlockDesc *block) const {
+  // There are a few places that var type can be set.
+  // When VarDesc is created, default set to LOD_TENSOR.
+  // When output variable is created, default is defaut set to LOD_TENSOR.
+  // We limit here to be the only place that operator defines its customized
+  // var type inference. Hence, we don't do any "default" setting here.
   auto &info = OpInfoMap::Instance().Get(this->Type());
   if (info.infer_var_type_) {
     info.infer_var_type_(*this, block);
-  } else {
-    // all output type is LoDTensor by default
-    VLOG(10) << this->Type()
-             << " has not registered InferVarType. Set output variables to "
-                "LOD_TENSOR";
-    for (auto &out_pair : this->outputs_) {
-      for (auto &out_var_name : out_pair.second) {
-        block->FindRecursiveOrCreateVar(out_var_name)
-            .SetType(proto::VarType::LOD_TENSOR);
-      }
-    }
   }
 }
 
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index 440e0509be..30c8a26c3d 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -121,10 +121,6 @@ class OpDesc {
 
   BlockDesc *Block() { return this->block_; }
 
-  const BlockDesc &BlockRef() const { return *this->block_; }
-
-  void SetBlock(BlockDesc *block) { this->block_ = block; }
-
  private:
   template <typename MapType>
   static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index df2a7a27ca..ca31303f77 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -21,7 +21,6 @@ namespace framework {
 void OpProtoAndCheckerMaker::Validate() {
   validated_ = true;
   CheckNoDuplicatedInOutAttrs();
-  CheckReuseVars();
 }
 
 OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput(
@@ -40,40 +39,6 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
   return OpProtoAndCheckerMaker::VariableBuilder{output};
 }
 
-void OpProtoAndCheckerMaker::Reuse(const std::string& name,
-                                   const std::string& reused_name) {
-  bool found = false;
-  proto::OpProto::Var* var;
-
-  for (auto& var : proto_->inputs()) {
-    if (var.name() == reused_name) {
-      found = true;
-      break;
-    }
-  }
-  PADDLE_ENFORCE(found == true,
-                 "Input/Output name: %s reused_name: %s, one of them is not "
-                 "exists or not matched.",
-                 name, reused_name);
-
-  found = false;
-  for (int i = 0; i < proto_->outputs().size(); ++i) {
-    var = proto_->mutable_outputs()->Mutable(i);
-    if (var->name() == name) {
-      PADDLE_ENFORCE(!var->has_reuse(),
-                     "Output(%s) has been set reused var of %s", name,
-                     var->reuse());
-      found = true;
-      var->set_reuse(reused_name);
-      break;
-    }
-  }
-  PADDLE_ENFORCE(found == true,
-                 "Input/Output name: %s reused_name: %s, one of them is not "
-                 "exists or not matched.",
-                 name, reused_name);
-}
-
 void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   std::unordered_set<std::string> names;
   auto checker = [&](const std::string& name) {
@@ -91,24 +56,6 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   }
 }
 
-void OpProtoAndCheckerMaker::CheckReuseVars() {
-  std::unordered_set<std::string> names;
-  for (auto& input : proto_->inputs()) {
-    names.insert(input.name());
-  }
-  auto checker = [&](const std::string& name, const std::string& reused) {
-    PADDLE_ENFORCE(
-        names.count(reused),
-        "Output [%s] reuse Input [%s], but the input is not registered.", name,
-        reused);
-  };
-  for (auto& output : proto_->outputs()) {
-    if (output.has_reuse()) {
-      checker(output.name(), output.reuse());
-    }
-  }
-}
-
 void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
                                         OpAttrChecker* attr_checker) {
   proto_ = proto;
@@ -124,6 +71,8 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
            static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
            static_cast<int>(OpRole::kLoss) |
                static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kOptimize) |
+               static_cast<int>(OpRole::kLRSched),
            static_cast<int>(OpRole::kNotSpecified)})
       .SetDefault(static_cast<int>(OpRole::kNotSpecified));
   AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 4ed3cc45d6..4c59c73d87 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -14,25 +14,26 @@ limitations under the License. */
 #pragma once
 
 #include <string>
-#include <unordered_set>
-
 #include "glog/logging.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/framework.pb.h"
 namespace paddle {
 namespace framework {
 
+//////////////////////////
+// Don't add more roles to make this too complicated!
+//////////////////////////
 enum class OpRole {
   kForward = 0x0000,
   kBackward = 0x0001,
   kOptimize = 0x0002,
   // RPC role is for send/recv releated op
-  kRPC = 0x0003,
+  kRPC = 0x0004,
   // Dist role is for split_byref/split_selected_rows/concat
   // used for distributed training.
-  kDist = 0x0004,
+  kDist = 0x0008,
   // Tag all learning rate scheduler operators.
-  kLRSched = 0x0005,
+  kLRSched = 0x0010,
 
   kLoss = 0x0100,
   // The default value of op's role. This should be only used for unittests and
@@ -73,11 +74,6 @@ class OpProtoAndCheckerMaker {
       var_->set_dispensable(true);
       return *this;
     }
-
-    VariableBuilder &Reuse(const std::string &name) {
-      var_->set_reuse(name);
-      return *this;
-    }
   };
 
   VariableBuilder AddInput(const std::string &name, const std::string &comment);
@@ -85,8 +81,6 @@ class OpProtoAndCheckerMaker {
   VariableBuilder AddOutput(const std::string &name,
                             const std::string &comment);
 
-  void Reuse(const std::string &name, const std::string &reused_name);
-
   template <typename T>
   TypedAttrChecker<T> &AddAttr(const std::string &name,
                                const std::string &comment,
@@ -105,8 +99,6 @@ class OpProtoAndCheckerMaker {
   void CheckNoDuplicatedInOutAttrs();
   void Validate();
 
-  void CheckReuseVars();
-
   proto::OpProto *proto_;
   OpAttrChecker *op_checker_;
   bool validated_{false};
diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
index b71c7b6468..a8030d377f 100644
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -47,120 +47,3 @@ TEST(ProtoMaker, DuplicatedInOut) {
   ASSERT_THROW(proto_maker(&op_proto, &op_checker),
                paddle::platform::EnforceNotMet);
 }
-
-class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "input of test op");
-    AddOutput("XOut", "output of test op").Reuse("X");
-  }
-};
-
-class TestInplaceProtoMaker2
-    : public paddle::framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "input of test op");
-    AddOutput("XOut", "output of test op").Reuse("X");
-    AddOutput("NoOut", "output of test op").Reuse("NotExists");
-  }
-};
-
-TEST(ProtoMaker, InplaceOutput) {
-  paddle::framework::proto::OpProto op_proto, op_proto2;
-  paddle::framework::OpAttrChecker op_checker;
-  TestInplaceProtoMaker proto_maker;
-  TestInplaceProtoMaker2 proto_maker2;
-
-  proto_maker(&op_proto, &op_checker);
-
-  ASSERT_THROW(proto_maker2(&op_proto2, &op_checker),
-               paddle::platform::EnforceNotMet);
-}
-
-// normal reuse
-class TestReuseProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
- public:
-  void Make() {
-    AddInput("X", "input of test op");
-    AddInput("Y", "input of test op");
-    AddOutput("Out", "output of test op");
-    AddOutput("XOut", "output of test op");
-    // avoid destructor exception.
-    // Validate();
-    TestReuse();
-  }
-
-  virtual void TestReuse() {}
-};
-
-// test duplicate reuse error
-class TestReuseProtoMaker2 : public TestReuseProtoMaker {
- public:
-  void TestReuse() {
-    Reuse("Out", "X");
-    Reuse("Out", "Y");
-  }
-};
-
-// NotExists Input
-class TestReuseProtoMaker3 : public TestReuseProtoMaker {
- public:
-  void TestReuse() {
-    Reuse("Out", "NotExists");
-    Reuse("XOut", "X");
-  }
-};
-
-// NotExists Output
-class TestReuseProtoMaker4 : public TestReuseProtoMaker {
- public:
-  void TestReuse() { Reuse("NotExists", "X"); }
-};
-
-TEST(ProtoMaker, Reuse) {
-  paddle::framework::proto::OpProto op_proto;
-  paddle::framework::OpAttrChecker op_checker;
-  TestReuseProtoMaker proto_maker;
-  proto_maker(&op_proto, &op_checker);
-}
-
-// NOTE(dzhwinter):
-// There is a Fatal CHECK on base class destructor, which will call abort inside
-// instead of
-// throw an exception. If we throw an exception in Make(), we will trigger the
-// CHECK and terminate the tests.
-//
-// I had tried to replace the default CHECK with a exception, however, it's
-// still not supported by glog.
-// the details:
-// https://github.com/google/glog/issues/249
-// https://github.com/facebookresearch/TensorComprehensions/issues/351
-/*
-TEST(ProtoMaker, ReuseWithException) {
-  paddle::framework::proto::OpProto op_proto2, op_proto3, op_proto4;
-  paddle::framework::OpAttrChecker op_checker;
-  TestReuseProtoMaker2 proto_maker2;
-  TestReuseProtoMaker3 proto_maker3;
-  TestReuseProtoMaker4 proto_maker4;
-  EXPECT_THROW(proto_maker2(&op_proto2, &op_checker),
-               paddle::platform::EnforceNotMet);
-
-  EXPECT_THROW(proto_maker3(&op_proto3, &op_checker),
-               paddle::platform::EnforceNotMet);
-
-  EXPECT_THROW(proto_maker4(&op_proto4, &op_checker),
-               paddle::platform::EnforceNotMet);
-}
-
-void FailureFunction() {
-  throw std::runtime_error("Check failed in destructor.");
-  // return 0;
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  google::InstallFailureFunction(&FailureFunction);
-  return RUN_ALL_TESTS();
-}
-*/
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 14fcde2fe3..45fc36c706 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -354,18 +354,18 @@ void OperatorBase::GenerateTemporaryNames() {
   }
 }
 
-static bool VarIsTensor(const Variable* var) {
-  return var->IsType<LoDTensor>() || var->IsType<SelectedRows>();
+static bool VarIsTensor(const Variable& var) {
+  return var.IsType<LoDTensor>() || var.IsType<SelectedRows>();
 }
 
-static const Tensor* GetTensorFromVar(Variable* var) {
-  if (var->IsType<LoDTensor>()) {
-    return var->GetMutable<LoDTensor>();
-  } else if (var->IsType<SelectedRows>()) {
-    return var->GetMutable<SelectedRows>()->mutable_value();
+const Tensor* GetTensorFromVar(const Variable& var) {
+  if (var.IsType<LoDTensor>()) {
+    return static_cast<const Tensor*>(&(var.Get<LoDTensor>()));
+  } else if (var.IsType<SelectedRows>()) {
+    return &(var.Get<SelectedRows>().value());
   } else {
     PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
-                 var->Type().name());
+                 var.Type().name());
   }
 }
 
@@ -415,8 +415,7 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
   auto* var = InputVar(name);
-  return var == nullptr ? nullptr
-                        : GetTensorFromVar(const_cast<Variable*>(var));
+  return var == nullptr ? nullptr : GetTensorFromVar(*var);
 }
 
 template <>
@@ -428,7 +427,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
   std::transform(names.begin(), names.end(), std::back_inserter(res),
                  [&](const std::string& sub_name) {
                    auto var = scope_.FindVar(sub_name);
-                   return var == nullptr ? nullptr : GetTensorFromVar(var);
+                   return var == nullptr ? nullptr : GetTensorFromVar(*var);
                  });
   return res;
 }
@@ -770,8 +769,10 @@ void OperatorWithKernel::TransferInplaceVarsBack(
   for (auto& var_name : inplace_vars) {
     VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
     auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
-    auto* transformed_tensor =
-        GetTensorFromVar(transfer_scope.FindVar(var_name));
+    auto* var = transfer_scope.FindVar(var_name);
+    PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr",
+                   var_name);
+    auto* transformed_tensor = GetTensorFromVar(*var);
     original_tensor->ShareDataWith(*transformed_tensor);
   }
 }
@@ -784,11 +785,11 @@ Scope* OperatorWithKernel::TryTransferData(
     for (auto& var_name : var_name_item.second) {
       auto* var = scope.FindVar(var_name);
       // Only tensor can be tranfer to another device.
-      if (var == nullptr || !VarIsTensor(var)) {
+      if (var == nullptr || !VarIsTensor(*var)) {
         continue;
       }
 
-      auto* tensor_in = GetTensorFromVar(var);
+      auto* tensor_in = GetTensorFromVar(*var);
       if (!tensor_in->IsInitialized()) {
         continue;
       }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 626b50edfd..96ad320523 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -63,6 +63,7 @@ inline std::string GradVarName(const std::string& var_name) {
 }
 
 proto::VarType::Type GetDataTypeOfVar(const Variable* var);
+const Tensor* GetTensorFromVar(const Variable& var);
 
 class OperatorBase;
 class ExecutionContext;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 093108cb54..a45b9ec7a2 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -109,18 +109,9 @@ ParallelExecutor::ParallelExecutor(
   if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
     BCastParamsToDevices(bcast_vars);
   }
-  // Startup Program has been run. All local scopes has correct parameters.
+// Startup Program has been run. All local scopes has correct parameters.
 
-  // Step 2. Create vars in each scope;
-  std::vector<details::VariableInfo> var_infos;
-  for (auto *var : main_program.Block(0).AllVars()) {
-    var_infos.emplace_back();
-    var_infos.back().name_ = var->Name();
-    var_infos.back().type_ = var->GetType();
-    var_infos.back().persistable_ = var->Persistable();
-  }
-
-// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
+// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
 // ncclOp
 #ifdef PADDLE_WITH_CUDA
   std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
@@ -156,13 +147,22 @@ ParallelExecutor::ParallelExecutor(
                            params, member_->local_scopes_, member_->use_cuda_);
 #endif
 
-  if (VLOG_IS_ON(5)) {
-    // If the loss_var_name is given, the number of graph should be only one.
-    if (loss_var_name.size()) {
-      PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
-                        "The number of graph should be only one");
+  // Step 3. Create vars in each scope. Passes may also create new vars.
+  //         skip control vars and empty vars
+  std::vector<details::VariableInfo> var_infos;
+  for (auto &node : graph->Nodes()) {
+    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
+      var_infos.emplace_back();
+      var_infos.back().name_ = node->Var()->Name();
+      var_infos.back().type_ = node->Var()->GetType();
+      var_infos.back().persistable_ = node->Var()->Persistable();
     }
   }
+  // If the loss_var_name is given, the number of graph should be only one.
+  if (loss_var_name.size()) {
+    PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
+                      "The number of graph should be only one");
+  }
 
   if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
     member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
@@ -187,6 +187,10 @@ void ParallelExecutor::BCastParamsToDevices(
     }
 
     auto &main_tensor = main_var->Get<LoDTensor>();
+    if (!main_tensor.IsInitialized()) {
+      VLOG(3) << "one in var not inited, return!";
+      continue;
+    }
     auto &dims = main_tensor.dims();
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #ifdef PADDLE_WITH_CUDA
@@ -299,10 +303,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
 }
 
 ParallelExecutor::~ParallelExecutor() {
-  const auto dev_ctxs =
-      platform::DeviceContextPool::Instance().GetAllDeviceContexts();
-  for (auto &dev_ctx : dev_ctxs) {
-    dev_ctx->Wait();
+  for (auto &p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
   }
 
   if (member_->own_local_scope_) {
diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc
index 7e689a37da..48bde2785e 100644
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -103,7 +103,7 @@ TEST(ProgramDesc, copy_ctor) {
       ASSERT_EQ(1, op->GetBlockAttrId("sub_block"));
       found_sub_block = true;
 
-      ASSERT_EQ(2, op->GetBlocksAttrIds("sub_blocks").size());
+      ASSERT_EQ(2UL, op->GetBlocksAttrIds("sub_blocks").size());
       found_sub_blocks = true;
     }
   }
diff --git a/paddle/fluid/framework/reader_test.cc b/paddle/fluid/framework/reader_test.cc
index 50aca4b5a4..d812417a38 100644
--- a/paddle/fluid/framework/reader_test.cc
+++ b/paddle/fluid/framework/reader_test.cc
@@ -40,7 +40,7 @@ TEST(READER, decorate_chain) {
     auto endpoints = root->GetEndPoints();
     ASSERT_EQ(endpoints.size(), 2U);
     ASSERT_NE(endpoints.count(end_point1.get()), 0UL);
-    ASSERT_NE(endpoints.count(end_point2.get()), 0);
+    ASSERT_NE(endpoints.count(end_point2.get()), 0UL);
   }
 
   {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index 14f9f36812..9462620e82 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -78,6 +78,8 @@ class Scope {
   /// Drop all kids scopes belonged to this scope.
   void DropKids();
 
+  std::list<Scope*>& kids() const { return kids_; }
+
   /// Find if a scope exists in the kid scopes
   bool HasKid(const Scope* scope) const;
 
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index cb2061c06a..a0a9a57360 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -75,6 +75,19 @@ TEST(Tensor, MutableData) {
                                         platform::CPUPlace());
     EXPECT_EQ(p1, p2);
   }
+  // Not sure if it's desired, but currently, Tensor type can be changed.
+  {
+    framework::Tensor src_tensor;
+    int8_t* p1 = src_tensor.mutable_data<int8_t>(framework::make_ddim({1}),
+                                                 platform::CPUPlace());
+    EXPECT_NE(p1, nullptr);
+    *p1 = 1;
+
+    uint8_t* p2 = src_tensor.mutable_data<uint8_t>(framework::make_ddim({1}),
+                                                   platform::CPUPlace());
+    EXPECT_NE(p2, nullptr);
+    EXPECT_EQ(static_cast<int>(p2[0]), 1);
+  }
 
 #ifdef PADDLE_WITH_CUDA
   {
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 69bcbc0e58..ca1e01c89f 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -153,6 +153,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
+  } else if (platform::is_cuda_pinned_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_pinned_place = boost::get<platform::CUDAPinnedPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size,
+                 nullptr);
   }
 #endif
 }
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
index 18cdca3a65..a588cb417a 100644
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -25,7 +25,6 @@ DEFINE_int32(dist_threadpool_size, 0,
 
 namespace paddle {
 namespace framework {
-
 std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
 std::once_flag ThreadPool::init_flag_;
 
@@ -47,8 +46,7 @@ void ThreadPool::Init() {
   }
 }
 
-ThreadPool::ThreadPool(int num_threads)
-    : total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
+ThreadPool::ThreadPool(int num_threads) : running_(true) {
   threads_.resize(num_threads);
   for (auto& thread : threads_) {
     // TODO(Yancey1989): binding the thread on the specify CPU number
@@ -59,6 +57,7 @@ ThreadPool::ThreadPool(int num_threads)
 ThreadPool::~ThreadPool() {
   {
     // notify all threads to stop running
+    std::lock_guard<std::mutex> l(mutex_);
     running_ = false;
     scheduled_.notify_all();
   }
@@ -69,36 +68,24 @@ ThreadPool::~ThreadPool() {
   }
 }
 
-void ThreadPool::Wait() {
-  std::unique_lock<std::mutex> lock(mutex_);
-  completed_.wait(lock, [=] { return Done() == true; });
-}
-
 void ThreadPool::TaskLoop() {
-  while (running_) {
+  while (true) {
     std::unique_lock<std::mutex> lock(mutex_);
-    scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
 
-    if (!running_) {
-      break;
+    scheduled_.wait(
+        lock, [this] { return !this->tasks_.empty() || !this->running_; });
+
+    if (!running_ || tasks_.empty()) {
+      return;
     }
+
     // pop a task from the task queue
     auto task = std::move(tasks_.front());
     tasks_.pop();
-
-    --idle_threads_;
     lock.unlock();
 
     // run the task
     task();
-
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      ++idle_threads_;
-      if (Done()) {
-        completed_.notify_all();
-      }
-    }
   }
 }
 
diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h
index 94111ee335..0687e628aa 100644
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@@ -57,15 +57,6 @@ class ThreadPool {
 
   ~ThreadPool();
 
-  // Returns the number of threads created by the constructor.
-  size_t Threads() const { return total_threads_; }
-
-  // Returns the number of currently idle threads.
-  size_t IdleThreads() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    return idle_threads_;
-  }
-
   // Run pushes a function to the task queue and returns a std::future
   // object.  To wait for the completion of the task, call
   // std::future::wait().
@@ -94,25 +85,13 @@ class ThreadPool {
     });
     std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
     tasks_.push(std::move(task));
-    lock.unlock();
     scheduled_.notify_one();
     return f;
   }
 
-  // Wait until all the tasks are completed.
-  void Wait();
-
  private:
   DISABLE_COPY_AND_ASSIGN(ThreadPool);
 
-  // If the task queue is empty and avaialbe is equal to the number of
-  // threads, means that all tasks are completed.  Note: this function
-  // is not thread-safe.  Returns true if all tasks are completed.
-  // Note: don't delete the data member total_threads_ and use
-  // threads_.size() instead; because you'd need to lock the mutex
-  // before accessing threads_.
-  bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
-
   // The constructor starts threads to run TaskLoop, which retrieves
   // and runs tasks from the queue.
   void TaskLoop();
@@ -125,14 +104,11 @@ class ThreadPool {
   static std::once_flag init_flag_;
 
   std::vector<std::unique_ptr<std::thread>> threads_;
-  const size_t total_threads_;
-  size_t idle_threads_;
 
   std::queue<Task> tasks_;
   std::mutex mutex_;
   bool running_;
   std::condition_variable scheduled_;
-  std::condition_variable completed_;
 };
 
 class ThreadPoolIO : ThreadPool {
diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc
index 27a4ffd4fc..884d61e234 100644
--- a/paddle/fluid/framework/threadpool_test.cc
+++ b/paddle/fluid/framework/threadpool_test.cc
@@ -19,10 +19,11 @@ limitations under the License. */
 
 namespace framework = paddle::framework;
 
-void do_sum(framework::ThreadPool* pool, std::atomic<int>* sum, int cnt) {
-  std::vector<std::future<void>> fs;
+void do_sum(std::vector<std::future<void>>* fs, std::mutex* mu,
+            std::atomic<int>* sum, int cnt) {
   for (int i = 0; i < cnt; ++i) {
-    fs.push_back(framework::Async([sum]() { sum->fetch_add(1); }));
+    std::lock_guard<std::mutex> l(*mu);
+    fs->push_back(framework::Async([sum]() { sum->fetch_add(1); }));
   }
 }
 
@@ -40,18 +41,21 @@ TEST(ThreadPool, ConcurrentInit) {
 }
 
 TEST(ThreadPool, ConcurrentRun) {
-  framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
   std::atomic<int> sum(0);
   std::vector<std::thread> threads;
+  std::vector<std::future<void>> fs;
+  std::mutex fs_mu;
   int n = 50;
   // sum = (n * (n + 1)) / 2
   for (int i = 1; i <= n; ++i) {
-    std::thread t(do_sum, pool, &sum, i);
+    std::thread t(do_sum, &fs, &fs_mu, &sum, i);
     threads.push_back(std::move(t));
   }
   for (auto& t : threads) {
     t.join();
   }
-  pool->Wait();
+  for (auto& t : fs) {
+    t.wait();
+  }
   EXPECT_EQ(sum, ((n + 1) * n) / 2);
 }
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index e099e40f12..2de6233a9e 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -36,7 +36,7 @@ using Attribute =
     boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                    std::vector<float>, std::vector<std::string>, bool,
                    std::vector<bool>, BlockDesc*, int64_t,
-                   std::vector<BlockDesc*>>;
+                   std::vector<BlockDesc*>, std::vector<int64_t>>;
 
 using AttributeMap = std::unordered_map<std::string, Attribute>;
 
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 9794a193bc..d31c8e3b7d 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -1,3 +1,6 @@
+if(WITH_TESTING)
+  include(test.cmake) # some generic cmake funtion for inference
+endif()
 # analysis and tensorrt must be added before creating static library,
 # otherwise, there would be undefined reference to them in static library.
 add_subdirectory(analysis)
@@ -30,7 +33,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
 endif()
 
 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor)
+cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
 
 if(NOT APPLE)
   # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
@@ -40,7 +43,7 @@ endif()
 
 # Create shared library
 cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
-    DEPS ${fluid_modules} paddle_fluid_api)
+    DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array)
 
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index d4d2fd4634..0354f9e6e9 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -20,22 +20,17 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
 cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid)
 
-function (inference_analysis_test TARGET)
-    if(WITH_TESTING)
-        set(options "")
-        set(oneValueArgs "")
-        set(multiValueArgs SRCS ARGS EXTRA_DEPS)
-        cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-        set(mem_opt "")
-        if(WITH_GPU)
-            set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
-        endif()
-        cc_test(${TARGET}
-                SRCS "${analysis_test_SRCS}"
-                DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
-                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
-        set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
-    endif(WITH_TESTING)
+function(inference_analysis_test TARGET)
+  if(WITH_TESTING)
+     set(options "")
+     set(oneValueArgs "")
+     set(multiValueArgs SRCS ARGS EXTRA_DEPS)
+     cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+     inference_base_test(${TARGET}
+             SRCS ${analysis_test_SRCS}
+             DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
+             ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR} ${analysis_test_ARGS})
+  endif()
 endfunction(inference_analysis_test)
 
 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index 2e79d495d5..ef4142f334 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -107,6 +107,9 @@ void Analyzer::Run(Argument* argument) {
     passes.push_back("mkldnn_placement_pass");
   }
 #endif
+  // infer_clean_graph_pass should be the first default pass
+  // after mkldnn_placement_pass.
+  passes.push_back("infer_clean_graph_pass");
   for (auto& pass : ir_passes_) {
     if (!disabled_ir_passes_.count(pass)) {
       passes.push_back(pass);
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index c51a4fdb2f..3af1d572df 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -67,7 +67,6 @@ class Analyzer : public OrderedRegistry<PassManager> {
   // larger fusion.
   const std::vector<std::string> all_ir_passes_{{
       // Manual update the passes here.
-      "infer_clean_graph_pass",         //
       "attention_lstm_fuse_pass",       //
       "seqconv_eltadd_relu_fuse_pass",  //
       "embedding_fc_lstm_fuse_pass",    //
@@ -80,6 +79,7 @@ class Analyzer : public OrderedRegistry<PassManager> {
       "conv_bn_fuse_pass",              //
       "conv_eltwiseadd_bn_fuse_pass",   //
 #ifdef PADDLE_WITH_MKLDNN
+      "depthwise_conv_mkldnn_pass",             //
       "conv_bias_mkldnn_fuse_pass",             //
       "conv_relu_mkldnn_fuse_pass",             //
       "conv_elementwise_add_mkldnn_fuse_pass",  //
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
index 1682011c3d..50ce20621f 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 
@@ -130,6 +131,8 @@ void SetOp(framework::ProgramDesc* prog, const std::string& type,
   op->SetType(type);
   op->SetInput("Xs", inputs);
   op->SetOutput("Xs", outputs);
+  op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(framework::OpRole::kForward));
 }
 
 TEST(DataFlowGraph, Build_IR_Graph) {
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 0ddd5d53f8..49a9ebe3dd 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -17,32 +17,14 @@ if(APPLE)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)
 
-
 set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB})
 
 if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor)
 endif()
 
-function(inference_api_test TARGET_NAME)
-    if (WITH_TESTING)
-        set(options "")
-        set(oneValueArgs SRC)
-        set(multiValueArgs ARGS)
-        cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-        cc_test(${TARGET_NAME}
-                SRCS ${inference_test_SRC}
-                DEPS "${inference_deps}"
-                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
-        if(inference_test_ARGS)
-            set_tests_properties(${TARGET_NAME}
-                    PROPERTIES DEPENDS "${inference_test_ARGS}")
-        endif()
-    endif(WITH_TESTING)
-endfunction(inference_api_test)
-
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope)
+cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
 cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)
@@ -50,10 +32,11 @@ cc_test(test_paddle_inference_api
         SRCS api_tester.cc
         DEPS paddle_inference_api)
 
-inference_api_test(test_api_impl SRC api_impl_tester.cc
-                    ARGS test_word2vec test_image_classification)
-
-set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+if(WITH_TESTING)
+  inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
+                      ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
+  set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
+endif()
 cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api
         ARGS --dirname=${PYTHON_TESTS_DIR}/book)
 
@@ -61,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
 cc_library(paddle_inference_tensorrt_subgraph_engine
         SRCS api_tensorrt_subgraph_engine.cc
         DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy)
-
-inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
+  if(WITH_TESTING)
+    inference_base_test(test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS ${inference_deps}
+                      ARGS --dirname=${WORD2VEC_MODEL_DIR})
+  endif()
 endif()
 
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index eec6657671..54c37fe645 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -82,6 +82,7 @@ bool AnalysisPredictor::Init(
 
   // Get the feed_target_names and fetch_target_names
   PrepareFeedFetch();
+
   return true;
 }
 
@@ -109,6 +110,10 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
     return false;
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
+
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
   return true;
 }
 
@@ -322,6 +327,9 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 
 bool AnalysisPredictor::ZeroCopyRun() {
   executor_->Run();
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
   return true;
 }
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 5a9f4d3695..b7dc206733 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
 
@@ -88,6 +89,7 @@ class AnalysisPredictor : public PaddlePredictor {
   // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
   // concurrency problems, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
+  details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 7cda9c5d8a..d06ab8f8c8 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -157,6 +158,10 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
     return false;
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
+
+  // Fix TensorArray reuse not cleaned bug.
+  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.ResetTensorArray();
   return true;
 }
 
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index 7882f6a53c..4e4ab47ca9 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -26,11 +26,11 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/init.h"
@@ -77,6 +77,7 @@ class NativePaddlePredictor : public PaddlePredictor {
   std::vector<framework::OpDesc *> fetchs_;
   // Do not use unique_ptr, use parent scope to delete
   framework::Scope *sub_scope_{nullptr};
+  details::TensorArrayBatchCleaner tensor_array_batch_cleaner_;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc
index b7b8ee6ea0..5152b8670d 100644
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -27,7 +27,9 @@ limitations under the License. */
 #define ACC_DIFF 1e-3
 #endif
 
-DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_string(word2vec_dirname, "",
+              "Directory of the word2vec inference model.");
+DEFINE_string(book_dirname, "", "Directory of the book inference model.");
 
 namespace paddle {
 
@@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
 
 NativeConfig GetConfig() {
   NativeConfig config;
-  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
+  config.model_dir = FLAGS_word2vec_dirname;
   LOG(INFO) << "dirname  " << config.model_dir;
   config.fraction_of_gpu_memory = 0.15;
 #ifdef PADDLE_WITH_CUDA
@@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) {
   NativeConfig config = GetConfig();
   config.use_gpu = use_gpu;
   config.model_dir =
-      FLAGS_dirname + "/image_classification_resnet.inference.model";
+      FLAGS_book_dirname + "/image_classification_resnet.inference.model";
 
   const bool is_combined = false;
   std::vector<std::vector<int64_t>> feed_target_shapes =
@@ -187,7 +189,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
   std::vector<std::thread> threads;
   for (int tid = 0; tid < num_jobs; ++tid) {
     threads.emplace_back([&, tid]() {
-      auto predictor = main_predictor->Clone();
+      auto predictor = CreatePaddlePredictor(config);
       auto& local_inputs = paddle_tensor_feeds[tid];
       std::vector<PaddleTensor> local_outputs;
       ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
@@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) {
   NativeConfig config = GetConfig();
   config.use_gpu = use_gpu;
   config.model_dir =
-      FLAGS_dirname + "/image_classification_resnet.inference.model";
+      FLAGS_book_dirname + "/image_classification_resnet.inference.model";
 
   auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
   std::vector<framework::LoDTensor> jobs(num_jobs);
@@ -245,7 +247,7 @@ void MainThreadsImageClassification(bool use_gpu) {
   std::vector<std::thread> threads;
   for (int tid = 0; tid < num_jobs; ++tid) {
     threads.emplace_back([&, tid]() {
-      auto predictor = main_predictor->Clone();
+      auto predictor = CreatePaddlePredictor(config);
       auto& local_inputs = paddle_tensor_feeds[tid];
       std::vector<PaddleTensor> local_outputs;
       ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
@@ -271,7 +273,7 @@ TEST(inference_api_native, word2vec_cpu_threads) {
   MainThreadsWord2Vec(false /*use_gpu*/);
 }
 TEST(inference_api_native, image_classification_cpu) {
-  MainThreadsImageClassification(false /*use_gpu*/);
+  MainImageClassification(false /*use_gpu*/);
 }
 TEST(inference_api_native, image_classification_cpu_threads) {
   MainThreadsImageClassification(false /*use_gpu*/);
@@ -279,15 +281,17 @@ TEST(inference_api_native, image_classification_cpu_threads) {
 
 #ifdef PADDLE_WITH_CUDA
 TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
-TEST(inference_api_native, word2vec_gpu_threads) {
-  MainThreadsWord2Vec(true /*use_gpu*/);
-}
+// Turn off temporarily for the unstable result.
+// TEST(inference_api_native, word2vec_gpu_threads) {
+//   MainThreadsWord2Vec(true /*use_gpu*/);
+// }
 TEST(inference_api_native, image_classification_gpu) {
-  MainThreadsImageClassification(true /*use_gpu*/);
-}
-TEST(inference_api_native, image_classification_gpu_threads) {
-  MainThreadsImageClassification(true /*use_gpu*/);
+  MainImageClassification(true /*use_gpu*/);
 }
+// Turn off temporarily for the unstable result.
+// TEST(inference_api_native, image_classification_gpu_threads) {
+//   MainThreadsImageClassification(true /*use_gpu*/);
+// }
 
 #endif
 
diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
index 702158ea3b..89c9a65cb0 100644
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
@@ -29,13 +29,13 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
 
   //# 1. Create PaddlePredictor with a config.
   NativeConfig config0;
-  config0.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config0.model_dir = FLAGS_dirname;
   config0.use_gpu = true;
   config0.fraction_of_gpu_memory = 0.3;
   config0.device = 0;
 
   MixedRTConfig config1;
-  config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config1.model_dir = FLAGS_dirname;
   config1.use_gpu = true;
   config1.fraction_of_gpu_memory = 0.3;
   config1.device = 0;
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 03f0f726eb..49683eab07 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -52,6 +52,7 @@ include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
 include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
 if (NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
 include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
@@ -61,8 +62,8 @@ endif(NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/boost")
 include_directories("${PADDLE_LIB}/third_party/eigen3")
 
-if (NOT WIN32) 
-  if (USE_TENSORRT AND WITH_GPU) 
+if (NOT WIN32)
+  if (USE_TENSORRT AND WITH_GPU)
       include_directories("${TENSORRT_INCLUDE_DIR}")
       link_directories("${TENSORRT_LIB_DIR}")
   endif()
@@ -77,13 +78,14 @@ endif(NOT WIN32)
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
 link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
 link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
 link_directories("${PADDLE_LIB}/paddle/lib")
 
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 
 if(WITH_MKL)
   include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} 
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
                ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
   set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
   if(EXISTS ${MKLDNN_PATH})
@@ -107,7 +109,7 @@ if (NOT WIN32)
 set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 set(DEPS ${DEPS}
     ${MATH_LIB} ${MKLDNN_LIB}
-    glog gflags protobuf snappystream snappy z
+    glog gflags protobuf snappystream snappy z xxhash
     ${EXTERNAL_LIB})
 else()
 set(DEPS ${DEPS}
@@ -120,7 +122,7 @@ endif(NOT WIN32)
 
 if(WITH_GPU)
   if(NOT WIN32)
-    if (USE_TENSORRT) 
+    if (USE_TENSORRT)
       set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
       set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
     endif()
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 67994aad70..ff718077c1 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -16,12 +16,12 @@ if [ $2 == ON ]; then
 fi
 if [ $3 == ON ]; then
   use_gpu_list='true false'
-else    
+else
   use_gpu_list='false'
 fi
 
 USE_TENSORRT=OFF
-if [ [-d"$TENSORRT_INCLUDE_DIR"] -a [-d"$TENSORRT_LIB_DIR"] ]; then
+if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then
   USE_TENSORRT=ON
 fi
 
@@ -62,7 +62,7 @@ for WITH_STATIC_LIB in ON OFF; do
     -DWITH_GPU=$TEST_GPU_CPU \
     -DWITH_STATIC_LIB=$WITH_STATIC_LIB
   make -j
-  word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model'
+  word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model'
   if [ -d $word2vec_model ]; then
     for use_gpu in $use_gpu_list; do
       ./simple_on_word2vec \
@@ -83,7 +83,7 @@ for WITH_STATIC_LIB in ON OFF; do
     -DWITH_STATIC_LIB=$WITH_STATIC_LIB
   make -j
   for use_gpu in $use_gpu_list; do
-    for vis_demo_name in $vis_demo_list; do 
+    for vis_demo_name in $vis_demo_list; do
       ./vis_demo \
         --modeldir=$DATA_DIR/$vis_demo_name/model \
         --data=$DATA_DIR/$vis_demo_name/data.txt \
@@ -95,7 +95,7 @@ for WITH_STATIC_LIB in ON OFF; do
       fi
     done
   done
-  
+
   # --------tensorrt mobilenet------
   if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
     rm -rf *
@@ -107,7 +107,7 @@ for WITH_STATIC_LIB in ON OFF; do
       -DUSE_TENSORRT=$USE_TENSORRT \
       -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \
       -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR
-    make -j 
+    make -j
     ./trt_mobilenet_demo \
       --modeldir=$DATA_DIR/mobilenet/model \
       --data=$DATA_DIR/mobilenet/data.txt \
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc
new file mode 100644
index 0000000000..4ae6c6dc9f
--- /dev/null
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
+
+namespace paddle {
+namespace details {
+
+// Should be called after the parameters are loaded.
+void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
+  if (flag_) {
+    for (auto &var_name : scope->LocalVarNames()) {
+      auto *var = scope->FindVar(var_name);
+      // TODO(Superjomn) should avoid the case when a TensorArray is a
+      // parameter.
+      if (var_name == "feed" || var_name == "fetch") continue;
+      if (var->Type() == typeid(framework::LoDTensorArray)) {
+        VLOG(4) << "collect " << var_name;
+        arrays_.push_back(var->GetMutable<framework::LoDTensorArray>());
+      }
+    }
+    for (auto *kid : scope->kids()) {
+      CollectTensorArrays(kid);
+    }
+
+    VLOG(3) << "Collect " << arrays_.size() << " arrays";
+    flag_ = false;
+  }
+}
+
+// Should be called when `Run` finished.
+void TensorArrayBatchCleaner::ResetTensorArray() {
+  for (auto *arr : arrays_) {
+    arr->clear();
+  }
+}
+
+}  // namespace details
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h
new file mode 100644
index 0000000000..a39449ff0e
--- /dev/null
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace details {
+
+// Clean the TensorArray each batch to make the behavior the same with the
+// training phase.
+struct TensorArrayBatchCleaner {
+  // Fix the tensor array not clear in the inference scenarios.
+  void CollectTensorArrays(framework::Scope *scope);
+  void ResetTensorArray();
+
+ private:
+  bool flag_{true};
+  std::vector<framework::LoDTensorArray *> arrays_;
+};
+
+}  // namespace details
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 24f59cf43a..e46dc13269 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -160,7 +160,8 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
                       double latency, int epoch = 1) {
   LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
             << ", threads: " << num_threads << ", thread id: " << tid
-            << ", latency: " << latency << "ms ======";
+            << ", latency: " << latency << "ms, fps: " << 1 / (latency / 1000.f)
+            << " ======";
   if (epoch > 1) {
     int samples = batch_size * epoch;
     LOG(INFO) << "====== sample number: " << samples
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 07ee6e72d1..a755ccb93b 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -124,7 +124,7 @@ class ZeroCopyTensor {
   std::vector<std::vector<size_t>> lod() const;
 
  protected:
-  ZeroCopyTensor(void* scope) : scope_{scope} {}
+  explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
   void SetName(const std::string& name) { name_ = name; }
   void* FindTensor() const;
 
@@ -259,12 +259,6 @@ struct AnalysisConfig : public NativeConfig {
     kExclude   // Specify the disabled passes in `ir_passes`.
   };
 
-  void SetIncludeMode() {
-    ir_mode = IrPassMode::kInclude;
-    // this pass has to be run at the beginning of all fuse passes
-    ir_passes = {"infer_clean_graph_pass"};
-  }
-
   // Determine whether to perform graph optimization.
   bool enable_ir_optim = true;
   // Manually determine the IR passes to run.
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index f9bb66a6e9..677f85152f 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -42,16 +42,22 @@ class Pool2dOpConverter : public OpConverter {
         boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
     std::vector<int> paddings =
         boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+    bool ceil_mode = boost::get<bool>(op_desc.GetAttr("ceil_mode"));
 
+    nvinfer1::Dims input_shape = input1->getDimensions();
+    int nbDims = input_shape.nbDims;
     nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
+    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
+    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
+
     if (global_pooling == true) {
-      nvinfer1::Dims input_shape = input1->getDimensions();
-      int nbDims = input_shape.nbDims;
       nv_ksize.d[0] = input_shape.d[nbDims - 2];
       nv_ksize.d[1] = input_shape.d[nbDims - 1];
+      nv_strides.h() = 1;
+      nv_strides.w() = 1;
+      nv_paddings.h() = 0;
+      nv_paddings.w() = 0;
     }
-    const nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
-    const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
 
     PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL);
 
@@ -64,6 +70,36 @@ class Pool2dOpConverter : public OpConverter {
       PADDLE_THROW("TensorRT unsupported pooling type!");
     }
 
+    if (ceil_mode) {
+      nvinfer1::DimsHW pre_pad(0, 0);
+      nvinfer1::DimsHW post_pad(0, 0);
+      int input_height = input_shape.d[nbDims - 2];
+      int input_width = input_shape.d[nbDims - 1];
+      int floor_h_output_size =
+          (input_height - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+      int ceil_h_output_size =
+          (input_height - ksize[0] + 2 * paddings[0] + strides[0] - 1) /
+              strides[0] +
+          1;
+
+      int floor_w_output_size =
+          (input_width - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+      int ceil_w_output_size =
+          (input_width - ksize[1] + 2 * paddings[1] + strides[1] - 1) /
+              strides[1] +
+          1;
+      if (floor_h_output_size != ceil_h_output_size) {
+        post_pad.h() = strides[0] - 1;
+      }
+
+      if (floor_w_output_size != ceil_w_output_size) {
+        post_pad.w() = strides[1] - 1;
+      }
+      auto* layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Padding, *const_cast<nvinfer1::ITensor*>(input1), pre_pad,
+          post_pad);
+      input1 = layer->getOutput(0);
+    }
     auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
                                        *const_cast<nvinfer1::ITensor*>(input1),
                                        nv_pool_type, nv_ksize);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
index aedd6b62df..ee597f8465 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
@@ -20,18 +20,20 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-void test_pool2d(bool global_pooling) {
+void test_pool2d(bool global_pooling, bool ceil_mode) {
   framework::Scope scope;
   std::unordered_set<std::string> parameters;
   TRTConvertValidation validator(5, parameters, scope, 1 << 15);
 
   // The ITensor's Dims should not contain the batch size.
   // So, the ITensor's Dims of input and output should be C * H * W.
-  validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4));
+  validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 13, 14));
   if (global_pooling)
     validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1));
+  else if (ceil_mode)
+    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 7));
   else
-    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2));
+    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 6));
 
   // Prepare Op description
   framework::OpDesc desc;
@@ -39,7 +41,7 @@ void test_pool2d(bool global_pooling) {
   desc.SetInput("X", {"pool2d-X"});
   desc.SetOutput("Out", {"pool2d-Out"});
 
-  std::vector<int> ksize({2, 2});
+  std::vector<int> ksize({3, 3});
   std::vector<int> strides({2, 2});
   std::vector<int> paddings({0, 0});
   std::string pooling_t = "max";
@@ -49,6 +51,7 @@ void test_pool2d(bool global_pooling) {
   desc.SetAttr("strides", strides);
   desc.SetAttr("paddings", paddings);
   desc.SetAttr("global_pooling", global_pooling);
+  desc.SetAttr("ceil_mode", ceil_mode);
 
   LOG(INFO) << "set OP";
   validator.SetOp(*desc.Proto());
@@ -57,9 +60,10 @@ void test_pool2d(bool global_pooling) {
   validator.Execute(3);
 }
 
-TEST(Pool2dOpConverter, normal) { test_pool2d(false); }
+TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
+TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }
 
-TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true); }
+TEST(Pool2dOpConverter, test_ceil_mode) { test_pool2d(false, true); }
 
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/paddle/fluid/inference/test.cmake b/paddle/fluid/inference/test.cmake
new file mode 100644
index 0000000000..ab3a30ce6b
--- /dev/null
+++ b/paddle/fluid/inference/test.cmake
@@ -0,0 +1,31 @@
+set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
+set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
+    "A path setting inference demo download directories.")
+function (inference_download install_dir url filename)
+    message(STATUS "Download inference test stuff from ${url}/${filename}")
+    execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
+    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
+    message(STATUS "finish downloading ${filename}")
+endfunction()
+
+function (inference_download_and_uncompress install_dir url filename)
+    inference_download(${install_dir} ${url} ${filename})
+    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
+endfunction()
+
+set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
+if (NOT EXISTS ${WORD2VEC_INSTALL_DIR})
+    inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
+endif()
+set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
+
+function (inference_base_test TARGET)
+   set(options "")
+   set(oneValueArgs "")
+   set(multiValueArgs SRCS ARGS DEPS)
+   cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+   if(WITH_GPU)
+       set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
+   endif()
+   cc_test(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS} ARGS ${mem_opt} ${base_test_ARGS})
+endfunction()
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index c3dd1f4336..71fdc67068 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -1,18 +1,4 @@
-set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com")
-set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
-    "A path setting inference demo download directories.")
 set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
-function (inference_download install_dir url filename)
-    message(STATUS "Download inference test stuff from ${url}/${filename}")
-    execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
-    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
-    message(STATUS "finish downloading ${filename}")
-endfunction()
-
-function (inference_download_and_uncompress install_dir url filename)
-    inference_download(${install_dir} ${url} ${filename})
-    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
-endfunction()
 
 function(download_model_and_data install_dir model_name data_name)
     if (NOT EXISTS ${install_dir})
diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index 6766829844..c2151eea08 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -71,7 +71,7 @@ void profile(bool use_mkldnn = false) {
 }
 
 TEST(Analyzer_resnet50, profile) { profile(); }
-#ifndef PADDLE_WITH_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
 TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); }
 #endif
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index 6399476680..e0416ff953 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -228,6 +228,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_rnn1, profile) {
   contrib::AnalysisConfig cfg;
   SetConfig(&cfg);
+  cfg.use_gpu = false;
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index b1ee108003..19c3f532d5 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -50,7 +50,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
     auto &ref_out = ref_outputs[i];
     size_t size = VecReduceToInt(out.shape);
     size_t ref_size = VecReduceToInt(ref_out.shape);
-    EXPECT_GT(size, 0);
+    EXPECT_GT(size, 0UL);
     EXPECT_EQ(size, ref_size);
     EXPECT_EQ(out.dtype, ref_out.dtype);
     switch (out.dtype) {
@@ -139,6 +139,9 @@ void TestMultiThreadPrediction(
   }
   for (int tid = 0; tid < num_threads; ++tid) {
     threads.emplace_back([&, tid]() {
+#ifdef PADDLE_WITH_MKLDNN
+      platform::set_cur_thread_id(static_cast<int>(tid) + 1);
+#endif
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
       std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 6c95f4b9c5..919ad96f7a 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -268,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND)
 else()
     set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
 endif()
+op_library(hash_op DEPS xxhash)
 op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows)
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)
@@ -284,10 +285,10 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
 if (NOT WIN32)
-op_library(lstm_op DEPS sequence2batch lstm_compute)
-op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
-op_library(lstmp_op DEPS sequence2batch lstm_compute)
-op_library(gru_op DEPS sequence2batch gru_compute)
+    op_library(lstm_op DEPS sequence2batch lstm_compute)
+    op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
+    op_library(lstmp_op DEPS sequence2batch lstm_compute)
+    op_library(gru_op DEPS sequence2batch gru_compute)
 endif(NOT WIN32)
 op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
@@ -300,6 +301,7 @@ op_library(flatten_op DEPS reshape_op)
 op_library(sequence_pad_op DEPS sequence_padding)
 op_library(unstack_op DEPS stack_op)
 op_library(fake_quantize_op DEPS memory)
+op_library(crf_decoding_op DEPS jit_kernel)
 op_library(fusion_lstm_op DEPS jit_kernel)
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
@@ -316,7 +318,7 @@ op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
-op_library(concat_op DEPS concat)
+op_library(concat_op DEPS concat_and_split)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 
@@ -348,6 +350,6 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
 if(NOT WIN32)
-nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+    nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index bbf52bea13..9ddb3a5d29 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -28,7 +28,7 @@ using paddle::framework::Tensor;
    public:                                                              \
     void Make() override {                                              \
       AddInput("X", "Input of " #OP_NAME " operator");                  \
-      AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X");   \
+      AddOutput("Out", "Output of " #OP_NAME " operator");              \
       AddAttr<bool>("use_mkldnn",                                       \
                     "(bool, default false) Only used in mkldnn kernel") \
           .SetDefault(false);                                           \
diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc
index 5d670fe3b9..f3717af630 100644
--- a/paddle/fluid/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
@@ -92,9 +92,9 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
     AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
 
-    AddOutput("ParamOut", "(Tensor) Output parameter").Reuse("Param");
-    AddOutput("Moment1Out", "(Tensor) Output first moment").Reuse("Moment1");
-    AddOutput("Moment2Out", "(Tensor) Output second moment").Reuse("Moment2");
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("Moment1Out", "(Tensor) Output first moment");
+    AddOutput("Moment2Out", "(Tensor) Output second moment");
 
     AddAttr<float>("beta1",
                    "(float, default 0.9) "
diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc
new file mode 100644
index 0000000000..8127e554be
--- /dev/null
+++ b/paddle/fluid/operators/add_position_encoding_op.cc
@@ -0,0 +1,97 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/add_position_encoding_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AddPositionEncodingOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "X(Input) of add_position_encoding_op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Out(Output) of add_position_encoding_op should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class AddPositionEncodingOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Out@GRAD must not be null.");
+
+    auto out_dims = ctx->GetInputDim("Out");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
+    }
+  }
+};
+
+class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input of AddPositionEncoding operator");
+    AddOutput("Out", "Output of AddPositionEncoding operator");
+    AddAttr<float>("alpha", "The scale of Original Embedding.")
+        .SetDefault(1.0f)
+        .AddCustomChecker([](const float& alpha) {
+          PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0.");
+        });
+    AddAttr<float>("beta", "The scale of Position Embedding.")
+        .SetDefault(1.0f)
+        .AddCustomChecker([](const float& beta) {
+          PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0.");
+        });
+    AddComment(R"DOC(
+    Add Position Encoding Operator.
+    
+    The add position encoding calculates the output based on the input, alpha, beta.
+    The size of each dimension of the parameters checked in the infer-shape.
+  )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plt = paddle::platform;
+
+REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp,
+                  ops::AddPositionEncodingOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    add_position_encoding,
+    ops::AddPositionEncodingKernel<plt::CPUDeviceContext, float>,
+    ops::AddPositionEncodingKernel<plt::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    add_position_encoding_grad,
+    ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, float>,
+    ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/add_position_encoding_op.h b/paddle/fluid/operators/add_position_encoding_op.h
new file mode 100644
index 0000000000..5f371235f1
--- /dev/null
+++ b/paddle/fluid/operators/add_position_encoding_op.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class AddPositionEncodingKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::LoDTensor>("X");
+    auto& x_lod = X->lod();
+    auto* src_ptr = X->data<T>();
+
+    auto* Out = context.Output<framework::LoDTensor>("Out");
+    auto* dst_ptr = Out->mutable_data<T>(context.GetPlace());
+
+    float alpha = context.Attr<float>("alpha");
+    float beta = context.Attr<float>("beta");
+
+    auto x_dim = X->dims();
+    int batch_size = 0;
+    int max_seq_len = 0;
+    int enc_size = 0;
+
+    if (x_lod.empty()) {
+      PADDLE_ENFORCE(
+          x_dim.size() == 3UL,
+          "The input X of Add Position Encoding should be 3-D Tensor!");
+      batch_size = x_dim[0];
+      max_seq_len = x_dim[1];
+      enc_size = x_dim[2];
+    } else {
+      PADDLE_ENFORCE(
+          x_dim.size() == 2UL,
+          "The input X of Add Position Encoding should be 2-D LoDTensor!");
+      PADDLE_ENFORCE(
+          x_lod.size() == 1UL,
+          "The Add Position Encoding Op only supports lod_level == 1!");
+      batch_size = x_lod[0].size() - 1;
+      max_seq_len = -1;
+      enc_size = x_dim[1];
+    }
+
+    PADDLE_ENFORCE(enc_size % 2 == 0, "Only support even encode size!");
+
+    const int half_size = enc_size / 2;
+    for (int i = 0; i < batch_size; ++i) {
+      const int max_length =
+          x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
+      for (int j = 0; j < max_length; ++j) {
+        for (int k = 0; k < half_size; ++k) {
+          const double val = (half_size > 1)
+                                 ? j / pow(10000.0, double(k) / (half_size - 1))
+                                 : j / 10000.0;
+          dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
+          dst_ptr[half_size + k] =
+              src_ptr[half_size + k] * alpha + cos(val) * beta;
+        }
+        src_ptr += enc_size;
+        dst_ptr += enc_size;
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class AddPositionEncodingGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dOut =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto dout = framework::EigenVector<T>::Flatten(*dOut);
+
+    auto* dX =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    dX->mutable_data<T>(context.GetPlace());
+    auto dx = framework::EigenVector<T>::Flatten(*dX);
+
+    float alpha = context.Attr<float>("alpha");
+
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+    dx.device(*place) = dout * static_cast<T>(alpha);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
new file mode 100644
index 0000000000..ed71594ba5
--- /dev/null
+++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using ScopedSpatialTransformerDescriptor =
+    platform::ScopedSpatialTransformerDescriptor;
+
+template <typename T>
+class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto* theta = ctx.Input<Tensor>("Theta");
+    auto* output = ctx.Output<Tensor>("Output");
+    const T* theta_data = theta->data<T>();
+
+    int n = theta->dims()[0];
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    Tensor h_sizes;
+    int* h_size_data;
+    if (size_attr.size() == 0) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      h_size_data = h_sizes.data<int>();
+    } else {
+      h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
+      h_size_data[0] = n;
+      h_size_data[1] = size_attr[1];
+      h_size_data[2] = size_attr[2];
+      h_size_data[3] = size_attr[3];
+    }
+
+    T* output_data = output->mutable_data<T>(
+        {n, h_size_data[2], h_size_data[3], 2}, ctx.GetPlace());
+    ScopedSpatialTransformerDescriptor st_desc;
+    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
+        st_desc.descriptor<T>(4, h_size_data);
+
+    PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward(
+        handle, cudnn_st_desc, theta_data, output_data));
+  }
+};
+
+template <typename T>
+class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
+
+    int n = output_grad->dims()[0];
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    Tensor h_sizes;
+    int* h_size_data;
+    if (size_attr.size() == 0) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      h_size_data = h_sizes.data<int>();
+    } else {
+      h_size_data = h_sizes.mutable_data<int>({4}, platform::CPUPlace());
+      h_size_data[0] = n;
+      h_size_data[1] = size_attr[1];
+      h_size_data[2] = size_attr[2];
+      h_size_data[3] = size_attr[3];
+    }
+
+    ScopedSpatialTransformerDescriptor st_desc;
+    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
+        st_desc.descriptor<T>(4, h_size_data);
+
+    const T* output_grad_data = output_grad->data<T>();
+    T* theta_grad_data = theta_grad->mutable_data<T>(ctx.GetPlace());
+
+    PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward(
+        handle, cudnn_st_desc, output_grad_data, theta_grad_data));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace plat = paddle::platform;
+REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace,
+                   paddle::operators::CUDNNAffineGridOpKernel<float>,
+                   paddle::operators::CUDNNAffineGridOpKernel<double>);
+REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace,
+                   paddle::operators::CUDNNAffineGridGradOpKernel<float>,
+                   paddle::operators::CUDNNAffineGridGradOpKernel<double>);
diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc
new file mode 100644
index 0000000000..0ea28265a2
--- /dev/null
+++ b/paddle/fluid/operators/affine_grid_op.cc
@@ -0,0 +1,233 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/affine_grid_op.h"
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+struct Linspace<paddle::platform::CPUDeviceContext, T> {
+  framework::Tensor operator()(T start, T end, int count,
+                               const framework::ExecutionContext& ctx) {
+    Tensor numbers;
+    T* number_data = numbers.mutable_data<T>({count}, platform::CPUPlace());
+    T slice = (end - start) / (T)(count - 1);
+    for (int i = 0; i < count; ++i) {
+      number_data[i] = start + (T)i * slice;
+    }
+    return numbers;
+  }
+};
+
+class AffineGridOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Theta"),
+                   "Input(Theta) of AffineGridOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                   "Output(Output) of AffineGridOp should not be null.");
+    auto theta_dims = ctx->GetInputDim("Theta");
+    PADDLE_ENFORCE(theta_dims.size() == 3,
+                   "AffineGrid's Input(Theta) should be 3-D tensor.");
+
+    auto output_shape = ctx->Attrs().Get<std::vector<int>>("output_shape");
+    if (output_shape.size() == 0) {
+      PADDLE_ENFORCE(ctx->HasInput("OutputShape"),
+                     "Input(OutputShape) of AffineGridOp should not be null if "
+                     "attr(output_shape) is not configured.");
+      auto output_shape_dims = ctx->GetInputDim("OutputShape");
+      PADDLE_ENFORCE(output_shape_dims.size() == 1,
+                     "AffineGrid's Input(OutputShape) should be 1-D tensor.");
+    } else {
+      PADDLE_ENFORCE(output_shape.size() == 4,
+                     "The size of attr(output_shape) should be 4.");
+    }
+
+    PADDLE_ENFORCE(theta_dims[1] == 2, "Input(theta) dims[1] should be 2.");
+    PADDLE_ENFORCE(theta_dims[2] == 3, "Input(theta) dims[2] should be 3.");
+    // N * H * W * 2
+    ctx->SetOutputDim("Output",
+                      framework::make_ddim({theta_dims[0], -1, -1, 2}));
+    ctx->ShareLoD("Theta", "Output");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_CUDA
+    if (platform::CanCUDNNBeUsed(ctx)) {
+      library = framework::LibraryType::kCUDNN;
+    }
+#endif
+    auto data_type = framework::ToDataType(ctx.Input<Tensor>("Theta")->type());
+    return framework::OpKernelType(data_type, ctx.GetPlace(),
+                                   framework::DataLayout::kAnyLayout, library);
+  }
+};
+
+class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "Theta",
+        "(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. "
+        "It is used to transform coordinate (x_0, y_0) to coordinate (x_1, "
+        "y_1).");
+    AddInput("OutputShape",
+             "(Tensor) The shape of target image with format [N, C, H, W].")
+        .AsDispensable();
+    AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2].");
+    AddAttr<bool>(
+        "use_cudnn",
+        "(bool, default false) Only used in cudnn kernel, need install cudnn")
+        .SetDefault(true);
+    AddAttr<std::vector<int>>(
+        "output_shape",
+        "The target output image shape with format [N, C, H, W].")
+        .SetDefault(std::vector<int>());
+
+    AddComment(R"DOC(
+    It generates a grid of (x,y) coordinates using the parameters of the
+    affine transformation that correspond to a set of points where the input
+    feature map should be sampled to produce the transformed output feature map.
+
+    Given:
+        Theta = [[[x_11, x_12, x_13]
+                  [x_14, x_15, x_16]]
+                 [[x_21, x_22, x_23]
+                  [x_24, x_25, x_26]]]
+    
+        OutputShape = [2, 3, 5, 5]
+
+    Step 1:
+
+        Generate relative coordinates according to OutputShape.
+        The values of relative coordinates are in the interval between -1 and 1.
+        The shape of the relative coordinates is [2, H, W] as below:
+    
+        C = [[[-1.  -1.  -1.  -1.  -1. ]
+              [-0.5 -0.5 -0.5 -0.5 -0.5]
+              [ 0.   0.   0.   0.   0. ]
+              [ 0.5  0.5  0.5  0.5  0.5]
+              [ 1.   1.   1.   1.   1. ]] 
+             [[-1.  -0.5  0.   0.5  1. ]
+              [-1.  -0.5  0.   0.5  1. ]
+              [-1.  -0.5  0.   0.5  1. ]
+              [-1.  -0.5  0.   0.5  1. ]
+              [-1.  -0.5  0.   0.5  1. ]]]
+        C[0] is the coordinates in height axis and  C[1] is the coordinates in width axis.
+    
+    Step2:
+        Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get:
+        C_ = [[-1.  -1.   1. ]
+              [-0.5 -1.   1. ]
+              [ 0.  -1.   1. ]
+              [ 0.5 -1.   1. ]
+              [ 1.  -1.   1. ]
+              [-1.  -0.5  1. ]
+              [-0.5 -0.5  1. ]
+              [ 0.  -0.5  1. ]
+              [ 0.5 -0.5  1. ]
+              [ 1.  -0.5  1. ]
+              [-1.   0.   1. ]
+              [-0.5  0.   1. ]
+              [ 0.   0.   1. ]
+              [ 0.5  0.   1. ]
+              [ 1.   0.   1. ]
+              [-1.   0.5  1. ]
+              [-0.5  0.5  1. ]
+              [ 0.   0.5  1. ]
+              [ 0.5  0.5  1. ]
+              [ 1.   0.5  1. ]
+              [-1.   1.   1. ]
+              [-0.5  1.   1. ]
+              [ 0.   1.   1. ]
+              [ 0.5  1.   1. ]
+              [ 1.   1.   1. ]]
+    Step3:
+        Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
+    )DOC");
+  }
+};
+
+class AffineGridOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto theta_dims = ctx->GetInputDim("Theta");
+    if (ctx->HasOutput(framework::GradVarName("Theta"))) {
+      ctx->SetOutputDim(framework::GradVarName("Theta"), theta_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_CUDA
+    if (platform::CanCUDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kCUDNN;
+    }
+#endif
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Theta")->type()),
+        ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_);
+  }
+};
+
+class AffineGridGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("affine_grid_grad");
+    op->SetInput("Theta", Input("Theta"));
+    op->SetInput("OutputShape", Input("OutputShape"));
+    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker,
+                  ops::AffineGridGradMaker);
+REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    affine_grid,
+    ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    affine_grid_grad,
+    ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h
new file mode 100644
index 0000000000..07e26c292c
--- /dev/null
+++ b/paddle/fluid/operators/affine_grid_op.h
@@ -0,0 +1,190 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+using Array1 = Eigen::DSizes<int64_t, 1>;
+using Array2 = Eigen::DSizes<int64_t, 2>;
+using Array3 = Eigen::DSizes<int64_t, 3>;
+using Array4 = Eigen::DSizes<int64_t, 4>;
+
+/**
+ *Return a tensor with evenly spaced numbers over a specified interval.
+ */
+template <typename DeviceContext, typename T>
+struct Linspace {
+  framework::Tensor operator()(T start, T end, int count,
+                               const framework::ExecutionContext& ctx);
+};
+
+template <typename DeviceContext, typename T>
+class AffineGridOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto* theta = ctx.Input<Tensor>("Theta");
+    int n = theta->dims()[0];
+
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    int h = 0;
+    int w = 0;
+    if (size_attr.size() == 0) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      Tensor h_sizes;
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      const int* h_size_data = h_sizes.data<int>();
+      h = h_size_data[2];
+      w = h_size_data[3];
+    } else {
+      h = size_attr[2];
+      w = size_attr[3];
+    }
+
+    auto* output = ctx.Output<Tensor>("Output");
+    output->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
+
+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), output,
+        static_cast<T>(0));
+
+    Linspace<DeviceContext, T> linspace;
+    // Get indexes of height with shape [height, width, 1]
+    auto h_idx = linspace((T)-1, (T)1, h, ctx);
+    auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
+    // Get indexes of width with shape [height, width, 1]
+    auto w_idx = linspace((T)-1, (T)1, w, ctx);
+    auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
+    // Get constant ones tensor with shape [height, width, 1]
+    Tensor ones;
+    ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
+    auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
+    // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
+    // ones
+    Tensor grid;
+    grid.mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
+    auto grid_t = EigenTensor<T, 4>::From(grid);
+
+    grid_t.device(place) = w_idx_t.reshape(Array2(1, w))
+                               .broadcast(Array2(h, 1))
+                               .reshape(Array3(h, w, 1))
+                               .concatenate(h_idx_t.reshape(Array2(1, h))
+                                                .broadcast(Array2(w, 1))
+                                                .shuffle(Array2(1, 0))
+                                                .reshape(Array3(h, w, 1)),
+                                            2)
+                               .eval()
+                               .concatenate(ones_t, 2)
+                               .reshape(Array4(1, h, w, 3))
+                               .broadcast(Array4(n, 1, 1, 1));
+
+    // output = grid * theta.T
+    // TODO(wanghaoshuang): Refine batched matrix multiply
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    for (int i = 0; i < n; ++i) {
+      Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
+      Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3});
+      Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2});
+      blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out,
+                  T(0));
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class AffineGridGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
+
+    int n = output_grad->dims()[0];
+    auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
+    int h = 0;
+    int w = 0;
+    if (size_attr.size() == 0) {
+      auto* output_shape = ctx.Input<Tensor>("OutputShape");
+      Tensor h_sizes;
+      framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
+      const int* h_size_data = h_sizes.data<int>();
+      h = h_size_data[2];
+      w = h_size_data[3];
+    } else {
+      h = size_attr[2];
+      w = size_attr[3];
+    }
+
+    theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
+
+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), theta_grad,
+        static_cast<T>(0));
+
+    Linspace<DeviceContext, T> linspace;
+
+    // Get indexes of height with shape [height, width, 1]
+    auto h_idx = linspace((T)-1, (T)1, h, ctx);
+    auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
+    // Get indexes of width with shape [height, width, 1]
+    auto w_idx = linspace((T)-1, (T)1, w, ctx);
+    auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
+    // Get constant ones tensor with shape [height, width, 1]
+    Tensor ones;
+    ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
+    auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
+    // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
+    // ones
+    Tensor grid;
+    grid.mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
+    auto grid_t = EigenTensor<T, 4>::From(grid);
+    grid_t.device(place) = w_idx_t.reshape(Array2(1, w))
+                               .broadcast(Array2(h, 1))
+                               .reshape(Array3(h, w, 1))
+                               .concatenate(h_idx_t.reshape(Array2(1, h))
+                                                .broadcast(Array2(w, 1))
+                                                .shuffle(Array2(1, 0))
+                                                .reshape(Array3(h, w, 1)),
+                                            2)
+                               .eval()
+                               .concatenate(ones_t, 2)
+                               .reshape(Array4(1, h, w, 3))
+                               .broadcast(Array4(n, 1, 1, 1));
+    // output = grid * theta.T
+    // TODO(wanghaoshuang): Refine batched matrix multiply
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    for (int i = 0; i < n; ++i) {
+      Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
+      Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2});
+      Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3});
+      blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1),
+                  &sliced_theta_grad, T(0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index b8b8b2290a..6257e04b01 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <paddle/fluid/operators/math/concat.h>
+#include <paddle/fluid/operators/math/concat_and_split.h>
 #include <numeric>
 
 #include "paddle/fluid/framework/lod_rank_table.h"
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 5912a1a17c..3eb4738325 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -135,15 +135,13 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Variance",
              "The global variance (for training) "
              "or estimated Variance (for testing)");
-    AddOutput("Y", "result after normalization").Reuse("X");
+    AddOutput("Y", "result after normalization");
     AddOutput("MeanOut",
               "Share memory with Mean. "
-              "Store the global mean when training")
-        .Reuse("Mean");
+              "Store the global mean when training");
     AddOutput("VarianceOut",
               "Share memory with Variance. "
-              "Store the global Variance when training")
-        .Reuse("Variance");
+              "Store the global Variance when training");
     AddOutput("SavedMean",
               "Mean of the current mini batch, "
               "will apply to output when training")
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index b6cb935814..0d32cae0e1 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -79,6 +79,9 @@ struct BeamSearchDecodeFunctor {
   bool tensor_on_gpu_;
   size_t beam_size_;
   int end_id_;
+  // TODO(Superjomn) Here might result serious performance issue in the
+  // concurrency
+  // scenarios.
   const LoDTensorArray& step_ids_origin_;
   const LoDTensorArray& step_scores_origin_;
   LoDTensorArray step_ids_ = LoDTensorArray();
diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index b2c6495c44..bd474be0fa 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 
 namespace paddle {
@@ -89,29 +89,17 @@ class ConcatGradKernel : public framework::OpKernel<T> {
         outputs.push_back(nullptr);
       }
     }
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
 
     // Sometimes direct copies will be faster, this maybe need deeply analysis.
     if (axis == 0 && outs.size() < 10) {
-      size_t input_offset = 0;
-      const auto in_stride = framework::stride_numel(out_grad->dims());
-
-      for (size_t i = 0; i < outs.size(); ++i) {
-        auto out_stride = framework::stride_numel(ins[i]->dims());
-        auto* out = outputs[i];
-        if (out != nullptr) {
-          StridedNumelCopyWithAxis<T>(
-              ctx.device_context(), axis, out->data<T>(), out_stride,
-              out_grad->data<T>() + input_offset, in_stride, out_stride[axis]);
-        }
-        input_offset += out_stride[axis];
-      }
+      std::vector<const framework::Tensor*> ref_shape;
+      ref_shape.insert(ref_shape.begin(), ins.begin(), ins.end());
+      StridedMemcpyWithAxis0<T>(dev_ctx, *out_grad, ref_shape, &outputs);
     } else {
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
-          concat_grad_functor;
-      concat_grad_functor(dev_ctx, *out_grad,
-                          ctx.MultiInput<framework::Tensor>("X"),
-                          static_cast<int>(axis), &outputs);
+      math::SplitFunctor<DeviceContext, T> split_functor;
+      split_functor(dev_ctx, *out_grad, ctx.MultiInput<framework::Tensor>("X"),
+                    static_cast<int>(axis), &outputs);
     }
   }
 };
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 8f2561fcc3..2cd9979bd3 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -130,8 +130,7 @@ void Conv2DOpMaker::Make() {
       .AsDispensable();
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator. "
-            "The format of output tensor is also NCHW.")
-      .Reuse("Input");
+            "The format of output tensor is also NCHW.");
   AddInput("ResidualData",
            "(Tensor) Tensor with residual data "
            "to which convolution output will be added."
@@ -238,8 +237,7 @@ void Conv3DOpMaker::Make() {
            "input image channels divided by the groups.");
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator."
-            "The format of output tensor is also NCDHW.")
-      .Reuse("Input");
+            "The format of output tensor is also NCDHW.");
   AddAttr<std::vector<int>>("strides",
                             "(vector<int>, default:{1, 1, 1}), the "
                             "strides(d_stride, h_stride, w_stride) of "
diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h
index 8181897c3d..e9d2e84a43 100644
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <limits>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/jit_kernel.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
@@ -69,9 +70,6 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     auto emission_dims = emission_weights.dims();
     const size_t seq_len = emission_dims[0];
     const size_t tag_num = emission_dims[1];
-
-    const size_t state_trans_base_idx = 2;
-
     const T* x = emission_weights.data<T>();
     const T* w = transition_weights.data<T>();
     int64_t* path = decoded_path->data<int64_t>();
@@ -84,221 +82,10 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     Tensor track;
     int* track_value =
         track.mutable_data<int>(emission_dims, platform::CPUPlace());
-
-#ifdef __AVX__
-// It use the AVX or AVX512 instruction to deal the data as the vector of 8 or
-// 16 elements per iteration. Then it can implement the parallel processing.
-// Only optimize for float type.
-#ifdef __AVX512F__
-    size_t step_size = 16;
-#else
-    size_t step_size = 8;
-#endif
-    if (std::is_same<T, float>::value && (tag_num >= step_size)) {
-      size_t steps = tag_num / step_size;
-      size_t remain = tag_num % step_size;
-      int last_offset = static_cast<int>(remain) - static_cast<int>(step_size);
-
-      // Setup the alpha initial value.
-      size_t i_offset = 0;
-      for (size_t i = 0; i <= steps; ++i) {
-#ifdef __AVX512F__
-        // Declare the variable for the content of weights, input and alpha
-        // values.
-        __m512 w_content, x_content, alpha_content;
-
-        // Load the relevant data into the variables from un-aligned address.
-        w_content = _mm512_loadu_ps((const float*)(w + i_offset));
-        x_content = _mm512_loadu_ps((const float*)(x + i_offset));
-        alpha_content = _mm512_add_ps(w_content, x_content);
-
-        // Save the alpha value.
-        _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
-                         alpha_content);
-#else
-        // Declare the variable for the content of weights, input and alpha
-        // values.
-        __m256 w_content, x_content, alpha_content;
-
-        // Load the relevant data into the variables from un-aligned address.
-        w_content = _mm256_loadu_ps((const float*)(w + i_offset));
-        x_content = _mm256_loadu_ps((const float*)(x + i_offset));
-        alpha_content = _mm256_add_ps(w_content, x_content);
-
-        // Save the alpha value.
-        _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
-                         alpha_content);
-#endif
-        i_offset += step_size;
-        if (i == steps - 1) {
-          if (remain > 0) {
-            i_offset += last_offset;
-          } else {
-            break;
-          }
-        }
-      }
-
-      // Use the column-major strategy to get the location of maximum score.
-      size_t seq_offset = 0;
-      for (size_t k = 1; k < seq_len; ++k) {
-        size_t j_offset = 0;
-        for (size_t j = 0; j <= steps; ++j) {
-#ifdef __AVX512F__
-          // Initialize the variables of maximum score and location.
-          __m512 max_score = _mm512_set1_ps(-std::numeric_limits<T>::max());
-          __m512i max_j = _mm512_setzero_si512();
-#else
-          // Initialize the variables of maximum score and location.
-          __m256 max_score = _mm256_set1_ps(-std::numeric_limits<T>::max());
-          __m256i max_j = _mm256_set1_epi32(0);
-#endif
-          // Calculate the offset of transition_weights.
-          size_t trans_offset = state_trans_base_idx * tag_num + j_offset;
-          for (size_t i = 0; i < tag_num; ++i) {
-#ifdef __AVX512F__
-            // Initalize the content of alpha variable with related offset.
-            __m512 alpha_content =
-                _mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i));
-            // Obtain the content of weights from un-aligned address.
-            __m512 w_content =
-                _mm512_loadu_ps((const float*)(w + trans_offset));
-
-            __m512 score_v = _mm512_add_ps(alpha_content, w_content);
-
-            __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
-
-            // According to the mask value, it update the index of the max_score
-            // location.
-            max_j = _mm512_mask_set1_epi32(max_j, mask, i);
-
-            // Update the max_score value.
-            max_score = _mm512_max_ps(max_score, score_v);
-#else
-            // Initalize the content of alpha variable with related offset.
-            __m256 alpha_content = _mm256_broadcast_ss(
-                (const float*)(alpha_value + seq_offset + i));
-            // Obtain the content of weights from un-aligned address.
-            __m256 w_content =
-                _mm256_loadu_ps((const float*)(w + trans_offset));
-            __m256 score_v = _mm256_add_ps(alpha_content, w_content);
-
-            __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
-
-#ifdef __AVX2__
-            // According to the mask value, it update the index of the max_score
-            // location.
-            max_j = _mm256_or_si256(
-                _mm256_andnot_si256((__m256i)mask, max_j),
-                _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
-#else
-            __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
-            __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
-            __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);
-            __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);
-
-            lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
-            hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
-            lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
-            hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
-
-            lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
-            hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
-
-            // According to the mask value, it update the index of the max_score
-            // location.
-            max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
-            max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
-#endif
-
-            // Update the max_score value.
-            max_score = _mm256_max_ps(max_score, score_v);
-#endif
-            trans_offset += tag_num;
-          }
-
-#ifdef __AVX512F__
-          // Update the alpha and track values.
-          __m512 x_content = _mm512_loadu_ps(
-              (const float*)(x + seq_offset + tag_num + j_offset));
-          max_score = _mm512_add_ps(max_score, x_content);
-          _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
-                                                    tag_num + j_offset),
-                           max_score);
-          _mm512_storeu_si512(
-              reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num +
-                                         j_offset),
-              max_j);
-#else
-          // Update the alpha and track values.
-          __m256 x_content = _mm256_loadu_ps(
-              (const float*)(x + seq_offset + tag_num + j_offset));
-          max_score = _mm256_add_ps(max_score, x_content);
-          _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
-                                                    tag_num + j_offset),
-                           max_score);
-          _mm256_storeu_si256(
-              reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num +
-                                         j_offset),
-              max_j);
-#endif
-
-          // Calculate the offset of next step
-          j_offset += step_size;
-          if (j == steps - 1) {
-            if (remain > 0) {
-              j_offset += last_offset;
-            } else {
-              break;
-            }
-          }
-        }
-
-        seq_offset += tag_num;
-      }
-    } else {
-      for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
-
-      for (size_t k = 1; k < seq_len; ++k) {
-        for (size_t i = 0; i < tag_num; ++i) {
-          T max_score = -std::numeric_limits<T>::max();
-          int max_j = 0;
-          for (size_t j = 0; j < tag_num; ++j) {
-            T score = alpha_value[(k - 1) * tag_num + j] +
-                      w[(j + state_trans_base_idx) * tag_num + i];
-            if (score > max_score) {
-              max_score = score;
-              max_j = j;
-            }
-          }
-
-          alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
-          track_value[k * tag_num + i] = max_j;
-        }
-      }
-    }
-#else
-    for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
-
-    for (size_t k = 1; k < seq_len; ++k) {
-      for (size_t i = 0; i < tag_num; ++i) {
-        T max_score = -std::numeric_limits<T>::max();
-        int max_j = 0;
-        for (size_t j = 0; j < tag_num; ++j) {
-          T score = alpha_value[(k - 1) * tag_num + j] +
-                    w[(j + state_trans_base_idx) * tag_num + i];
-          if (score > max_score) {
-            max_score = score;
-            max_j = j;
-          }
-        }
-
-        alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
-        track_value[k * tag_num + i] = max_j;
-      }
-    }
-
-#endif
+    const auto& ker = math::jitkernel::KernelPool::Instance()
+                          .template Get<math::jitkernel::CRFDecodeKernel<T>>(
+                              static_cast<int>(tag_num));
+    ker->Compute(static_cast<int>(seq_len), x, w, alpha_value, track_value);
     T max_score = -std::numeric_limits<T>::max();
     int max_i = 0;
     for (size_t i = 0; i < tag_num; ++i) {
diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc
index d7a9bfbc43..89416f7ab5 100644
--- a/paddle/fluid/operators/delete_var_op.cc
+++ b/paddle/fluid/operators/delete_var_op.cc
@@ -32,6 +32,11 @@ class DeleteVarOp : public framework::OperatorBase {
   }
 };
 
+class DeleteVarOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {}
+};
+
 class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -48,4 +53,5 @@ It should not be configured by users directly.
 
 REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp,
                   paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::DeleteVarOpInfoMaker);
+                  paddle::operators::DeleteVarOpInfoMaker,
+                  paddle::operators::DeleteVarOpShapeInference);
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
index d7a53f1bef..fddd688401 100644
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
@@ -439,31 +439,88 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
 class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    // TODO(buxingyuan): Add Document
-    AddInput("RpnRois", "RpnRois.");
-    AddInput("GtClasses", "GtClasses.");
-    AddInput("IsCrowd", "IsCrowd.");
-    AddInput("GtBoxes", "GtBoxes.");
-    AddInput("ImInfo", "ImInfo.");
-
-    AddOutput("Rois", "Rois.");
-    AddOutput("LabelsInt32", "LabelsInt32.");
-    AddOutput("BboxTargets", "BboxTargets.");
-    AddOutput("BboxInsideWeights", "BboxInsideWeights.");
-    AddOutput("BboxOutsideWeights", "BboxOutsideWeights.");
-
-    AddAttr<int>("batch_size_per_im", "batch_size_per_im");
-    AddAttr<float>("fg_fraction", "fg_fraction");
-    AddAttr<float>("fg_thresh", "fg_thresh");
-    AddAttr<float>("bg_thresh_hi", "bg_thresh_hi");
-    AddAttr<float>("bg_thresh_lo", "bg_thresh_lo");
-    AddAttr<std::vector<float>>("bbox_reg_weights", "bbox_reg_weights");
-    AddAttr<int>("class_nums", "class_nums");
-    AddAttr<bool>("use_random", "use_random").SetDefault(true);
+    AddInput(
+        "RpnRois",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [N, 4]. "
+        "N is the number of the GenerateProposalOp's output, "
+        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
+    AddInput("GtClasses",
+             "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
+             "M is the number of groundtruth, "
+             "each element is a class label of groundtruth.");
+    AddInput(
+        "IsCrowd",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
+        "M is the number of groundtruth, "
+        "each element is a flag indicates whether a groundtruth is crowd.");
+    AddInput(
+        "GtBoxes",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [M, 4]. "
+        "M is the number of groundtruth, "
+        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
+    AddInput("ImInfo",
+             "(Tensor), This input is a 2D Tensor with shape [B, 3]. "
+             "B is the number of input images, "
+             "each element consists of im_height, im_width, im_scale.");
+
+    AddOutput(
+        "Rois",
+        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. "
+        "P usuall equal to  batch_size_per_im * batch_size, "
+        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
+    AddOutput("LabelsInt32",
+              "(LoDTensor), This output is a 2D LoDTensor with shape [P], "
+              "each element repersents a class label of a roi");
+    AddOutput("BboxTargets",
+              "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
+              "class_nums], "
+              "each element repersents a box label of a roi");
+    AddOutput(
+        "BboxInsideWeights",
+        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
+        "class_nums], "
+        "each element indicates whether a box should contribute to loss.");
+    AddOutput(
+        "BboxOutsideWeights",
+        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
+        "class_nums], "
+        "each element indicates whether a box should contribute to loss.");
+
+    AddAttr<int>("batch_size_per_im", "Batch size of rois per images.");
+    AddAttr<float>("fg_fraction",
+                   "Foreground fraction in total batch_size_per_im.");
+    AddAttr<float>(
+        "fg_thresh",
+        "Overlap threshold which is used to chose foreground sample.");
+    AddAttr<float>("bg_thresh_hi",
+                   "Overlap threshold upper bound which is used to chose "
+                   "background sample.");
+    AddAttr<float>("bg_thresh_lo",
+                   "Overlap threshold lower bound which is used to chose "
+                   "background sample.");
+    AddAttr<std::vector<float>>("bbox_reg_weights", "Box regression weights.");
+    AddAttr<int>("class_nums", "Class number.");
+    AddAttr<bool>(
+        "use_random",
+        "Use random sampling to choose foreground and background boxes.")
+        .SetDefault(true);
 
     AddComment(R"DOC(
-Generate Proposals Labels Operator.
-)DOC");
+This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth,
+to sample foreground boxes and background boxes, and compute loss target.
+
+RpnRois is the output boxes of RPN and was processed by generate_proposal_op, these boxes
+were combined with groundtruth boxes and sampled according to batch_size_per_im and fg_fraction,
+If an instance with a groundtruth overlap greater than fg_thresh, then it was considered as a foreground sample.
+If an instance with a groundtruth overlap greater than bg_thresh_lo and lower than bg_thresh_hi,
+then it was considered as a background sample.
+After all foreground and background boxes are chosen (so called Rois),
+then we apply random sampling to make sure
+the number of foreground boxes is no more than batch_size_per_im * fg_fraction.
+
+For each box in Rois, we assign the classification (class label) and regression targets (box label) to it.
+Finally BboxInsideWeights and BboxOutsideWeights are used to specify whether it would contribute to training loss.
+    )DOC");
   }
 };
 
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index a69d9c9a52..709c2dfc4b 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -284,7 +284,7 @@ static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox,
       selected_indices.push_back(idx);
       ++selected_num;
     }
-    sorted_indices.erase(sorted_indices.end());
+    sorted_indices.erase(sorted_indices.end() - 1);
     if (flag && eta < 1 && adaptive_threshold > 0.5) {
       adaptive_threshold *= eta;
     }
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index dda423efd3..46fff9d338 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -52,6 +52,9 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(
         ctx->HasOutput("TargetBBox"),
         "Output(TargetBBox) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("BBoxInsideWeight"),
+        "Output(BBoxInsideWeight) of RpnTargetAssignOp should not be null");
 
     auto anchor_dims = ctx->GetInputDim("Anchor");
     auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
@@ -68,6 +71,7 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ScoreIndex", {-1});
     ctx->SetOutputDim("TargetLabel", {-1, 1});
     ctx->SetOutputDim("TargetBBox", {-1, 4});
+    ctx->SetOutputDim("BBoxInsideWeight", {-1, 4});
   }
 
  protected:
@@ -169,6 +173,7 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data,
                  const float rpn_positive_overlap,
                  const float rpn_negative_overlap, std::vector<int>* fg_inds,
                  std::vector<int>* bg_inds, std::vector<int>* tgt_lbl,
+                 std::vector<int>* fg_fake, std::vector<T>* bbox_inside_weight,
                  std::minstd_rand engine, bool use_random) {
   float epsilon = 0.00001;
   int anchor_num = anchor_to_gt_max.dims()[0];
@@ -201,12 +206,12 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data,
   // Reservoir Sampling
   int fg_num = static_cast<int>(rpn_fg_fraction * rpn_batch_size_per_im);
   ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random);
-  fg_num = static_cast<int>(fg_inds_fake.size());
-  for (int64_t i = 0; i < fg_num; ++i) {
+  int fg_fake_num = static_cast<int>(fg_inds_fake.size());
+  for (int64_t i = 0; i < fg_fake_num; ++i) {
     target_label[fg_inds_fake[i]] = 1;
   }
 
-  int bg_num = rpn_batch_size_per_im - fg_num;
+  int bg_num = rpn_batch_size_per_im - fg_fake_num;
   for (int64_t i = 0; i < anchor_num; ++i) {
     if (anchor_to_gt_max_data[i] < rpn_negative_overlap) {
       bg_inds_fake.push_back(i);
@@ -214,12 +219,28 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data,
   }
   ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random);
   bg_num = static_cast<int>(bg_inds_fake.size());
+  int fake_num = 0;
   for (int64_t i = 0; i < bg_num; ++i) {
+    // fg fake found
+    if (target_label[bg_inds_fake[i]] == 1) {
+      fake_num++;
+      fg_fake->emplace_back(fg_inds_fake[0]);
+      for (int j = 0; j < 4; ++j) {
+        bbox_inside_weight->emplace_back(T(0.));
+      }
+    }
     target_label[bg_inds_fake[i]] = 0;
   }
 
+  for (int64_t i = 0; i < (fg_fake_num - fake_num) * 4; ++i) {
+    bbox_inside_weight->emplace_back(T(1.));
+  }
+
   for (int64_t i = 0; i < anchor_num; ++i) {
-    if (target_label[i] == 1) fg_inds->emplace_back(i);
+    if (target_label[i] == 1) {
+      fg_inds->emplace_back(i);
+      fg_fake->emplace_back(i);
+    }
     if (target_label[i] == 0) bg_inds->emplace_back(i);
   }
   fg_num = fg_inds->size();
@@ -248,7 +269,8 @@ std::vector<Tensor> SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx,
   std::vector<int> bg_inds;
   std::vector<int> gt_inds;
   std::vector<int> tgt_lbl;
-
+  std::vector<int> fg_fake;
+  std::vector<T> bbox_inside_weight;
   // Calculate the max IoU between anchors and gt boxes
   // Map from anchor to gt box that has highest overlap
   auto place = ctx.GetPlace();
@@ -275,32 +297,37 @@ std::vector<Tensor> SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx,
   // Follow the Faster RCNN's implementation
   ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max,
               rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap,
-              rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, engine,
-              use_random);
+              rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, &fg_fake,
+              &bbox_inside_weight, engine, use_random);
 
   int fg_num = fg_inds.size();
   int bg_num = bg_inds.size();
-  gt_inds.reserve(fg_num);
-  for (int i = 0; i < fg_num; ++i) {
-    gt_inds.emplace_back(argmax[fg_inds[i]]);
+  int fg_fake_num = fg_fake.size();
+  gt_inds.reserve(fg_fake_num);
+  for (int i = 0; i < fg_fake_num; ++i) {
+    gt_inds.emplace_back(argmax[fg_fake[i]]);
   }
-
-  Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t;
-  int* loc_index_data = loc_index_t.mutable_data<int>({fg_num}, place);
+  Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t;
+  int* loc_index_data = loc_index_t.mutable_data<int>({fg_fake_num}, place);
   int* score_index_data =
       score_index_t.mutable_data<int>({fg_num + bg_num}, place);
   int* tgt_lbl_data = tgt_lbl_t.mutable_data<int>({fg_num + bg_num}, place);
-  int* gt_inds_data = gt_inds_t.mutable_data<int>({fg_num}, place);
-  std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data);
+  int* gt_inds_data = gt_inds_t.mutable_data<int>({fg_fake_num}, place);
+  T* bbox_inside_weight_data =
+      bbox_inside_weight_t.mutable_data<T>({fg_fake_num, 4}, place);
+  std::copy(fg_fake.begin(), fg_fake.end(), loc_index_data);
   std::copy(fg_inds.begin(), fg_inds.end(), score_index_data);
   std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num);
   std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data);
   std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data);
+  std::copy(bbox_inside_weight.begin(), bbox_inside_weight.end(),
+            bbox_inside_weight_data);
   std::vector<Tensor> loc_score_tgtlbl_gt;
   loc_score_tgtlbl_gt.emplace_back(loc_index_t);
   loc_score_tgtlbl_gt.emplace_back(score_index_t);
   loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t);
   loc_score_tgtlbl_gt.emplace_back(gt_inds_t);
+  loc_score_tgtlbl_gt.emplace_back(bbox_inside_weight_t);
 
   return loc_score_tgtlbl_gt;
 }
@@ -318,6 +345,7 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
     auto* score_index = context.Output<LoDTensor>("ScoreIndex");
     auto* tgt_bbox = context.Output<LoDTensor>("TargetBBox");
     auto* tgt_lbl = context.Output<LoDTensor>("TargetLabel");
+    auto* bbox_inside_weight = context.Output<LoDTensor>("BBoxInsideWeight");
 
     PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
                       "RpnTargetAssignOp gt_boxes needs 1 level of LoD");
@@ -340,7 +368,7 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
     score_index->mutable_data<int>({max_num}, place);
     tgt_bbox->mutable_data<T>({max_num, 4}, place);
     tgt_lbl->mutable_data<int>({max_num, 1}, place);
-
+    bbox_inside_weight->mutable_data<T>({max_num, 4}, place);
     auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
 
     std::random_device rnd;
@@ -394,6 +422,7 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
       Tensor sampled_score_index = loc_score_tgtlbl_gt[1];
       Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2];
       Tensor sampled_gt_index = loc_score_tgtlbl_gt[3];
+      Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4];
 
       int loc_num = sampled_loc_index.dims()[0];
       int score_num = sampled_score_index.dims()[0];
@@ -432,6 +461,8 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
       AppendRpns<int>(score_index, total_score_num, &sampled_score_index_unmap);
       AppendRpns<T>(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox);
       AppendRpns<int>(tgt_lbl, total_score_num, &sampled_tgtlbl);
+      AppendRpns<T>(bbox_inside_weight, total_loc_num * 4,
+                    &sampled_bbox_inside_weight);
       total_loc_num += loc_num;
 
       total_score_num += score_num;
@@ -448,10 +479,12 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
     score_index->set_lod(loc_score);
     tgt_bbox->set_lod(lod_loc);
     tgt_lbl->set_lod(loc_score);
+    bbox_inside_weight->set_lod(lod_loc);
     loc_index->Resize({total_loc_num});
     score_index->Resize({total_score_num});
     tgt_bbox->Resize({total_loc_num, 4});
     tgt_lbl->Resize({total_score_num, 1});
+    bbox_inside_weight->Resize({total_loc_num, 4});
   }
 };
 
@@ -514,6 +547,9 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
         "TargetLabel",
         "(Tensor<int>), The target labels of each anchor with shape "
         "[F + B, 1], F and B are sampled foreground and backgroud number.");
+    AddOutput("BBoxInsideWeight",
+              "(Tensor), The bbox inside weight with shape "
+              "[F, 4], F is the sampled foreground number.");
     AddComment(R"DOC(
 This operator can be, for a given set of ground truth bboxes and the
 anchors, to assign classification and regression targets to each prediction.
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index 07322e720f..3c28ef3092 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/dropout_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -57,6 +58,29 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
                   "will be dropped.")
         .SetDefault(false);
     AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
+    AddAttr<std::string>(
+        "dropout_implementation",
+        "[\"downgrade_in_infer\"|\"upscale_in_train\"]"
+        "There are two kinds of ways to implement dropout"
+        "(the mask below is a tensor have the same shape with input"
+        "the value of mask is 0 or 1, the ratio of 0 is dropout_prob)"
+        "1. downgrade_in_infer(default), downgrade the outcome at inference "
+        "time"
+        "   train: out = input * mask"
+        "   inference: out = input * dropout_prob"
+        "2. upscale_in_train, upscale the outcome at training time, do nothing "
+        "in inference"
+        "   train: out = input * mask / ( 1.0 - dropout_prob )"
+        "   inference: out = input"
+        "   dropout op can be removed from the program. the program will be "
+        "efficient")
+        .SetDefault("downgrade_in_infer")
+        .AddCustomChecker([](const std::string& type) {
+          PADDLE_ENFORCE(
+              type == "downgrade_in_infer" || type == "upscale_in_train",
+              "dropout_implementation can only be downgrade_in_infer or "
+              "upscale_in_train");
+        });
 
     AddComment(R"DOC(
 Dropout Operator.
@@ -104,7 +128,9 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
+    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     dropout_grad,
-    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index 1dd66e0280..e011f47e08 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/random.h>
 #include <thrust/transform.h>
+#include <string>
 #include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -26,7 +27,8 @@ namespace operators {
 template <typename T>
 __global__ void RandomGenerator(const size_t n, const int seed,
                                 const float dropout_prob, const T* src,
-                                T* mask_data, T* dst) {
+                                T* mask_data, T* dst,
+                                bool is_upscale_in_train) {
   thrust::minstd_rand rng;
   rng.seed(seed);
   thrust::uniform_real_distribution<float> dist(0, 1);
@@ -47,7 +49,11 @@ __global__ void RandomGenerator(const size_t n, const int seed,
     if (dist(rng) < dropout_prob) {
       mask = static_cast<T>(0);
     } else {
-      mask = static_cast<T>(1);
+      if (is_upscale_in_train) {
+        mask = static_cast<T>(1.0f / (1.0f - dropout_prob));
+      } else {
+        mask = static_cast<T>(1);
+      }
     }
     dest = s * mask;
     mask_data[idx] = mask;
@@ -67,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
     y->mutable_data<T>(context.GetPlace());
     float dropout_prob = context.Attr<float>("dropout_prob");
 
+    auto dropout_implementation =
+        context.Attr<std::string>("dropout_implementation");
     auto& place = *context.template device_context<Place>().eigen_device();
     if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
@@ -83,11 +91,16 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
       int grid = (x->numel() + threads - 1) / threads;
       RandomGenerator<
           T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
-          size, seed, dropout_prob, x_data, mask_data, y_data);
+          size, seed, dropout_prob, x_data, mask_data, y_data,
+          (dropout_implementation == "upscale_in_train"));
     } else {
       auto X = EigenMatrix<T>::Reshape(*x, 1);
       auto Y = EigenMatrix<T>::Reshape(*y, 1);
-      Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
+      if (dropout_implementation == "upscale_in_train") {
+        Y.device(place) = X;
+      } else {
+        Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
+      }
     }
   }
 };
@@ -99,6 +112,8 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
-    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(dropout_grad,
-                        ops::DropoutGradKernel<plat::CUDADeviceContext, float>);
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    dropout_grad, ops::DropoutGradKernel<plat::CUDADeviceContext, float>,
+    ops::DropoutGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
index 0628b4b826..6c629b7b6d 100644
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 #pragma once
 
 #include <random>
+#include <string>
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -36,6 +37,8 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
     auto* y_data = y->mutable_data<T>(context.GetPlace());
     float dropout_prob = context.Attr<float>("dropout_prob");
 
+    auto dropout_implementation =
+        context.Attr<std::string>("dropout_implementation");
     if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
@@ -49,14 +52,20 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
       engine.seed(seed);
 
       std::uniform_real_distribution<float> dist(0, 1);
+
       size_t size = framework::product(mask->dims());
       for (size_t i = 0; i < size; ++i) {
         if (dist(engine) < dropout_prob) {
           mask_data[i] = 0;
           y_data[i] = 0;
         } else {
-          mask_data[i] = 1;
-          y_data[i] = x_data[i];
+          if (dropout_implementation == "upscale_in_train") {
+            mask_data[i] = 1.0f / static_cast<T>(1.0f - dropout_prob);
+            y_data[i] = x_data[i] / static_cast<T>(1.0f - dropout_prob);
+          } else {
+            mask_data[i] = 1;
+            y_data[i] = x_data[i];
+          }
         }
       }
     } else {
@@ -64,7 +73,11 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
       auto Y = EigenMatrix<T>::Reshape(*y, 1);
       auto& place =
           *context.template device_context<DeviceContext>().eigen_device();
-      Y.device(place) = X * (1.0f - dropout_prob);
+      if (dropout_implementation == "upscale_in_train") {
+        Y.device(place) = X;
+      } else {
+        Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index 7e5975ead6..68c6e315cc 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -80,8 +80,6 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() final {
     AddInput("X", "(Tensor), The first input tensor of elementwise op.");
     AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
-    // AddOutput("SavedShape", "(Tensor), save X, Y shape for grad to save
-    // memory.").AsIntermediate();
     AddOutput("Out", "The output of elementwise op.");
     AddAttr<int>("axis",
                  "(int, default -1). The start dimension index "
@@ -129,13 +127,11 @@ But the output only shares the LoD information with the input $X$.
 
 )DOC",
                                GetName(), GetEquation()));
-    SetReuse();
   }
 
  protected:
   virtual std::string GetName() const = 0;
   virtual std::string GetEquation() const = 0;
-  virtual void SetReuse() {}
 };
 
 class ElementwiseOpGrad : public framework::OperatorWithKernel {
@@ -269,7 +265,6 @@ class ElemwiseGradKernel : public framework::OpKernel<T> {
    protected:                                                          \
     virtual std::string GetName() const { return op_name; }            \
     virtual std::string GetEquation() const { return equation; }       \
-    virtual void SetReuse() { Reuse(__VA_ARGS__); }                    \
   };                                                                   \
   REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,       \
                     __ElemwiseOp##op_type##Maker__,                    \
diff --git a/paddle/fluid/operators/fake_init_op.cc b/paddle/fluid/operators/fake_init_op.cc
new file mode 100644
index 0000000000..28ebdcb03e
--- /dev/null
+++ b/paddle/fluid/operators/fake_init_op.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+class FakeInitInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FakeInitOp should not be null.");
+    auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
+    ctx->SetOutputDim("Out", framework::make_ddim(shape));
+  }
+};
+
+class FakeInitOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    framework::Tensor *tensor = nullptr;
+
+    auto &out_var = *scope.FindVar(Output("Out"));
+
+    if (out_var.IsType<framework::LoDTensor>()) {
+      tensor = out_var.GetMutable<framework::LoDTensor>();
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
+    } else if (out_var.IsType<framework::SelectedRows>()) {
+      tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
+    } else {
+      PADDLE_THROW(
+          "fake init op's output only"
+          "supports SelectedRows and LoDTensor");
+    }
+  }
+};
+
+class FakeInitOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {}
+};
+
+class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddAttr<std::vector<int64_t>>("shape",
+                                  "(vector<int64_t>) The shape of the output");
+    AddOutput("Out",
+              "(Tensor) Tensor of specified shape will be filled "
+              "with the specified value");
+    AddComment(R"DOC(
+FakeInit Operator.
+
+Init an variable but not alloc memory for it, it is used for init the
+table parameter at trainer side in distributed lookup table.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fake_init, ops::FakeInitOp, ops::FakeInitInferShape,
+                  ops::FakeInitOpMaker, paddle::framework::EmptyGradOpMaker,
+                  ops::FakeInitOpVarTypeInference);
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index e04a68717b..252f313440 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -24,7 +24,7 @@ class FillConstantInferShape : public framework::InferShapeBase {
   void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of FillConstantOp should not be null.");
-    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
     ctx->SetOutputDim("Out", framework::make_ddim(shape));
   }
 };
@@ -47,10 +47,10 @@ class FillConstantOp : public framework::OperatorBase {
 
     if (out_var.IsType<framework::LoDTensor>()) {
       tensor = out_var.GetMutable<framework::LoDTensor>();
-      tensor->Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
     } else if (out_var.IsType<framework::SelectedRows>()) {
       tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
-      tensor->Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+      tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
     } else {
       PADDLE_THROW(
           "fill constant op's output only"
@@ -83,7 +83,8 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(int, default 5 (FP32)) "
                  "Output data type")
         .SetDefault(framework::proto::VarType::FP32);
-    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
+    AddAttr<std::vector<int64_t>>("shape",
+                                  "(vector<int64_t>) The shape of the output");
     AddAttr<float>("value", "(float, default 0) The value to be filled")
         .SetDefault(0.0f);
     AddAttr<bool>("force_cpu",
diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc
index a04c1c1263..120b2ab440 100644
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -16,10 +16,9 @@ limitations under the License. */
 #include <cstring>  // for memcpy
 #include <string>
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
+#include "paddle/fluid/operators/math/jit_kernel.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
-#include "paddle/fluid/platform/cpu_info.h"
 
 namespace paddle {
 namespace operators {
@@ -174,58 +173,44 @@ class FusionGRUKernel : public framework::OpKernel<T> {
     }
   }
 
-#define INIT_VEC_FUNC                                                     \
-  std::function<void(const int, const T *, T *)> act_gate, act_state;     \
-  std::function<void(const int, const T*, const T*, const T*, T*)> cross; \
-  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");          \
-  auto& act_state_str = ctx.Attr<std::string>("activation");              \
-  if (platform::jit::MayIUse(platform::jit::avx)) {                       \
-    math::VecActivations<T, platform::jit::avx> act_functor;              \
-    act_gate = act_functor(act_gate_str);                                 \
-    act_state = act_functor(act_state_str);                               \
-    cross = math::vec_cross<T, platform::jit::avx>;                       \
-  } else {                                                                \
-    math::VecActivations<T, platform::jit::isa_any> act_functor;          \
-    act_gate = act_functor(act_gate_str);                                 \
-    act_state = act_functor(act_state_str);                               \
-    cross = math::vec_cross<T, platform::jit::isa_any>;                   \
-  }
-
-#define INIT_BASE_INPUT_OUTPUT                        \
-  auto* h0 = ctx.Input<Tensor>("H0");                 \
-  auto* wx = ctx.Input<Tensor>("WeightX");            \
-  auto* wh = ctx.Input<Tensor>("WeightH");            \
-  auto* bias = ctx.Input<Tensor>("Bias");             \
-  auto* xx = ctx.Output<LoDTensor>("XX");             \
-  auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
-  bool is_reverse = ctx.Attr<bool>("is_reverse");
-
-#define INIT_BASE_SIZES                  \
-  auto x_dims = x->dims();   /* T x M*/  \
-  auto wh_dims = wh->dims(); /* D x 3D*/ \
-  const int total_T = x_dims[0];         \
-  const int M = x_dims[1];               \
-  const int D = wh_dims[0];              \
-  const int D3 = wh_dims[1];             \
-  const int D2 = D * 2;
+#define INIT_BASE_DEFINES                  \
+  auto* x = ctx.Input<LoDTensor>("X");     \
+  auto* wh = ctx.Input<Tensor>("WeightH"); \
+  auto* xx = ctx.Output<LoDTensor>("XX");  \
+  auto x_lod = x->lod();                   \
+  auto x_dims = x->dims();   /* T x M*/    \
+  auto wh_dims = wh->dims(); /* D x 3D*/   \
+  const int total_T = x_dims[0];           \
+  const int D3 = wh_dims[1]
+
+#define INIT_OTHER_DEFINES                                                     \
+  auto* h0 = ctx.Input<Tensor>("H0");                                          \
+  auto* wx = ctx.Input<Tensor>("WeightX");                                     \
+  auto* bias = ctx.Input<Tensor>("Bias");                                      \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");                          \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");                              \
+  const int M = x_dims[1];                                                     \
+  const int D = wh_dims[0];                                                    \
+  const int D2 = D * 2;                                                        \
+  const auto& ker = math::jitkernel::KernelPool::Instance()                    \
+                        .template Get<math::jitkernel::GRUKernel<T>,           \
+                                      const std::string&, const std::string&>( \
+                            ctx.Attr<std::string>("gate_activation"),          \
+                            ctx.Attr<std::string>("activation"), D);           \
+  const T* x_data = x->data<T>();                                              \
+  const T* wx_data = wx->data<T>();                                            \
+  const T* wh_data = wh->data<T>();                                            \
+  auto place = ctx.GetPlace();                                                 \
+  T* xx_data = xx->mutable_data<T>(place)
 
   void SeqCompute(const framework::ExecutionContext& ctx) const {
     using DeviceContext = paddle::platform::CPUDeviceContext;
-    auto* x = ctx.Input<LoDTensor>("X");
-    INIT_BASE_INPUT_OUTPUT
-    INIT_BASE_SIZES
-    INIT_VEC_FUNC
-
-    auto x_lod = x->lod();
+    INIT_BASE_DEFINES;
+    INIT_OTHER_DEFINES;
     const int N = x_lod[0].size() - 1;
-    const T* x_data = x->data<T>();
     const T* h0_data = h0 ? h0->data<T>() : nullptr;
-    const T* wx_data = wx->data<T>();
-    const T* wh_data = wh->data<T>();
     const T* wh_state_data = wh_data + D * D2;
-    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
-    T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
-
+    T* hidden_out_data = hidden_out->mutable_data<T>(place);
     auto blas = math::GetBlas<DeviceContext, T>(ctx);
     math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, x_data, wx_data,
                                       xx_data,
@@ -252,14 +237,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       if (h0_data) {
         prev_hidden_data = h0_data + bid * D;
       } else {
-        // W: {W_update, W_reset; W_state}
-        // update gate
-        act_gate(D, xx_data, xx_data);
-        // state gate
-        act_state(D, xx_data + D2, xx_data + D2);
-        // out = a*b
-        blas.VMUL(D, xx_data, xx_data + D2, hidden_out_data);
-        // save prev
+        ker->ComputeH1(xx_data, hidden_out_data);
         prev_hidden_data = hidden_out_data;
         tstart = 1;
         move_step();
@@ -269,17 +247,12 @@ class FusionGRUKernel : public framework::OpKernel<T> {
         blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast<T>(1),
                   prev_hidden_data, D, wh_data, D2, static_cast<T>(1), xx_data,
                   D3);
-        act_gate(D2, xx_data, xx_data);
-        // rt = rt*ht_1 inplace result
-        blas.VMUL(D, prev_hidden_data, xx_data + D, hidden_out_data);
-
+        ker->ComputeHtPart1(xx_data, prev_hidden_data, hidden_out_data);
         // gemm rt * Ws
         blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast<T>(1),
                   hidden_out_data, D, wh_state_data, D, static_cast<T>(1),
                   xx_data + D2, D3);
-        act_state(D, xx_data + D2, xx_data + D2);
-        // out = zt*ht~ + (1-zt)*ht_1
-        cross(D, xx_data, xx_data + D2, prev_hidden_data, hidden_out_data);
+        ker->ComputeHtPart2(xx_data, prev_hidden_data, hidden_out_data);
         // save prev
         prev_hidden_data = hidden_out_data;
         move_step();
@@ -289,28 +262,19 @@ class FusionGRUKernel : public framework::OpKernel<T> {
 
   void BatchCompute(const framework::ExecutionContext& ctx) const {
     using DeviceContext = paddle::platform::CPUDeviceContext;
-    auto* x = ctx.Input<LoDTensor>("X");
-    INIT_BASE_INPUT_OUTPUT
-    INIT_BASE_SIZES
-    if (x->lod()[0].size() == 2) {
+    INIT_BASE_DEFINES;
+    if (x_lod[0].size() == 2) {
       xx->Resize({total_T, D3});
       SeqCompute(ctx);
       return;
     }
-    INIT_VEC_FUNC
-
+    INIT_OTHER_DEFINES;
     auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
     auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
     auto* batched_out = ctx.Output<LoDTensor>("BatchedOut");
-
-    const T* x_data = x->data<T>();
-    const T* wx_data = wx->data<T>();
-    const T* wh_data = wh->data<T>();
-    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
-    T* batched_input_data = batched_input->mutable_data<T>(ctx.GetPlace());
-    T* batched_out_data = batched_out->mutable_data<T>(ctx.GetPlace());
-    hidden_out->mutable_data<T>(ctx.GetPlace());
-
+    T* batched_input_data = batched_input->mutable_data<T>(place);
+    T* batched_out_data = batched_out->mutable_data<T>(place);
+    hidden_out->mutable_data<T>(place);
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
@@ -336,7 +300,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
     T* prev_hidden_data = nullptr;
     if (h0) {
       // reorder h0
-      T* reordered_h0_data = reordered_h0->mutable_data<T>(ctx.GetPlace());
+      T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
       const T* h0_data = h0->data<T>();
       prev_hidden_data = reordered_h0_data;
       size_t sz = sizeof(T) * D;
@@ -350,12 +314,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       T* cur_out_data = batched_out_data;
       // W: {W_update, W_reset; W_state}
       for (int i = 0; i < max_bs; ++i) {
-        // update gate
-        act_gate(D, cur_in_data, cur_in_data);
-        // state gate
-        act_state(D, cur_in_data + D2, cur_in_data + D2);
-        // out = a*b
-        blas.VMUL(D, cur_in_data, cur_in_data + D2, cur_out_data);
+        ker->ComputeH1(cur_in_data, cur_out_data);
         // add offset
         cur_in_data += D3;
         cur_out_data += D;
@@ -380,10 +339,8 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       T* cur_out_data = batched_out_data;
       T* cur_prev_hidden_data = prev_hidden_data;
       for (int i = 0; i < cur_bs; ++i) {
-        act_gate(D2, cur_batched_data, cur_batched_data);
-        // rt = rt*ht_1 inplace result
-        blas.VMUL(D, cur_prev_hidden_data, cur_batched_data + D, cur_out_data);
-
+        ker->ComputeHtPart1(cur_batched_data, cur_prev_hidden_data,
+                            cur_out_data);
         cur_batched_data += D3;
         cur_prev_hidden_data += D;
         cur_out_data += D;
@@ -397,12 +354,8 @@ class FusionGRUKernel : public framework::OpKernel<T> {
 
       cur_prev_hidden_data = prev_hidden_data;
       for (int i = 0; i < cur_bs; ++i) {
-        // ht~ = act_state(...)
-        act_state(D, cur_batched_data + D2, cur_batched_data + D2);
-        // out = zt*ht~ + (1-zt)*ht_1
-        cross(D, cur_batched_data, cur_batched_data + D2, cur_prev_hidden_data,
-              cur_out_data);
-
+        ker->ComputeHtPart2(cur_batched_data, cur_prev_hidden_data,
+                            cur_out_data);
         cur_batched_data += D3;
         cur_prev_hidden_data += D;
         cur_out_data += D;
@@ -416,9 +369,8 @@ class FusionGRUKernel : public framework::OpKernel<T> {
     batched_out->set_lod(batched_lod);
     to_seq(dev_ctx, *batched_out, hidden_out);
   }
-#undef INIT_VEC_FUNC
-#undef INIT_BASE_SIZES
-#undef INIT_BASE_INPUT_OUTPUT
+#undef INIT_OTHER_DEFINES
+#undef INIT_BASE_DEFINES
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 089b541a0a..f84ff206ff 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
 REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
-                       ops::GatherOpKernel<int>, ops::GatherOpKernel<double>);
+                       ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
+                       ops::GatherOpKernel<int64_t>);
 REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
+                       ops::GatherGradientOpKernel<double>,
                        ops::GatherGradientOpKernel<int>,
-                       ops::GatherGradientOpKernel<double>);
+                       ops::GatherGradientOpKernel<int64_t>);
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
index 7e014dd1cb..9f4aef08cd 100644
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
-REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>,
+                        ops::GatherOpCUDAKernel<double>,
+                        ops::GatherOpCUDAKernel<int64_t>,
+                        ops::GatherOpCUDAKernel<int>);
+REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>,
+                        ops::GatherGradOpCUDAKernel<double>,
+                        ops::GatherGradOpCUDAKernel<int64_t>,
+                        ops::GatherGradOpCUDAKernel<int>);
diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc
index 1488aab192..c70d5b8bc7 100644
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
@@ -52,7 +52,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of GaussianRandomOp should not be null.");
-    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
     std::vector<int64_t> temp;
     temp.reserve(shape.size());
     for (auto dim : shape) {
@@ -88,9 +88,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddOutput("Out", "Output matrix of gaussian random op");
 
-    AddAttr<std::vector<int>>("shape",
-                              "(vector<int>) "
-                              "The dimension of random tensor.");
+    AddAttr<std::vector<int64_t>>("shape",
+                                  "(vector<int64_t>) "
+                                  "The dimension of random tensor.");
     AddAttr<float>("mean",
                    "(float, default 0.0) "
                    "mean of random tensor.")
diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
new file mode 100644
index 0000000000..7cde7ca462
--- /dev/null
+++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
@@ -0,0 +1,132 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
+using DataLayout = platform::DataLayout;
+using ScopedSpatialTransformerDescriptor =
+    platform::ScopedSpatialTransformerDescriptor;
+template <typename T>
+using CudnnDataType = platform::CudnnDataType<T>;
+
+template <typename T>
+class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace");
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto* input = ctx.Input<Tensor>("X");
+    auto* grid = ctx.Input<Tensor>("Grid");
+    auto* output = ctx.Output<Tensor>("Output");
+
+    int n = input->dims()[0];
+    int c = input->dims()[1];
+    int h = input->dims()[2];
+    int w = input->dims()[3];
+    const int size[4] = {n, c, h, w};
+
+    const T* input_data = input->data<T>();
+    const T* grid_data = grid->data<T>();
+    T* output_data = output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+
+    ScopedSpatialTransformerDescriptor st_desc;
+    cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
+        st_desc.descriptor<T>(4, size);
+
+    ScopedTensorDescriptor input_desc;
+    ScopedTensorDescriptor output_desc;
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        DataLayout::kNCHW, framework::vectorize2int(input->dims()));
+    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+        DataLayout::kNCHW, framework::vectorize2int(output->dims()));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerForward(
+        handle, cudnn_st_desc, CudnnDataType<T>::kOne(), cudnn_input_desc,
+        input_data, grid_data, CudnnDataType<T>::kZero(), cudnn_output_desc,
+        output_data));
+  }
+};
+
+template <typename T>
+class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace");
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto* input = ctx.Input<Tensor>("X");
+    auto* grid = ctx.Input<Tensor>("Grid");
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
+
+    auto output_grad_dims = output_grad->dims();
+    const int n = output_grad_dims[0];
+    const int c = output_grad_dims[1];
+    const int h = output_grad_dims[2];
+    const int w = output_grad_dims[3];
+    const int size[4] = {n, c, h, w};
+
+    ScopedSpatialTransformerDescriptor st_dest;
+    cudnnSpatialTransformerDescriptor_t cudnn_st_dest =
+        st_dest.descriptor<T>(4, size);
+
+    const T* input_data = input->data<T>();
+    const T* grid_data = grid->data<T>();
+    const T* output_grad_data = output_grad->data<T>();
+    T* input_grad_data =
+        input_grad->mutable_data<T>(output_grad_dims, ctx.GetPlace());
+    T* grid_grad_data =
+        grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
+
+    ScopedTensorDescriptor input_desc;
+    ScopedTensorDescriptor input_grad_desc;
+    ScopedTensorDescriptor output_grad_desc;
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        DataLayout::kNCHW, framework::vectorize2int(input->dims()));
+    cudnnTensorDescriptor_t cudnn_input_grad_desc =
+        input_grad_desc.descriptor<T>(
+            DataLayout::kNCHW, framework::vectorize2int(input_grad->dims()));
+    cudnnTensorDescriptor_t cudnn_output_grad_desc =
+        output_grad_desc.descriptor<T>(
+            DataLayout::kNCHW, framework::vectorize2int(output_grad->dims()));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerBackward(
+        handle, cudnn_st_dest, CudnnDataType<T>::kOne(), cudnn_input_desc,
+        input_data, CudnnDataType<T>::kZero(), cudnn_input_grad_desc,
+        input_grad_data, CudnnDataType<T>::kOne(), cudnn_output_grad_desc,
+        output_grad_data, grid_data, CudnnDataType<T>::kZero(),
+        grid_grad_data));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace plat = paddle::platform;
+REGISTER_OP_KERNEL(grid_sampler, CUDNN, plat::CUDAPlace,
+                   paddle::operators::CUDNNGridSampleOpKernel<float>,
+                   paddle::operators::CUDNNGridSampleOpKernel<double>);
+REGISTER_OP_KERNEL(grid_sampler_grad, CUDNN, plat::CUDAPlace,
+                   paddle::operators::CUDNNGridSampleGradOpKernel<float>,
+                   paddle::operators::CUDNNGridSampleGradOpKernel<double>);
diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
new file mode 100644
index 0000000000..e76eb6893b
--- /dev/null
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -0,0 +1,203 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/grid_sampler_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class GridSampleOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of GridSampleOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grid"),
+                   "Input(Grid) of GridSampleOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                   "Output(Output) of GridSampleOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto grid_dims = ctx->GetInputDim("Grid");
+    PADDLE_ENFORCE(x_dims.size() == 4,
+                   "Input(X) of GridSampleOp should be 4-D Tensor.");
+    PADDLE_ENFORCE(grid_dims.size() == 4,
+                   "Input(Grid) of GridSampleOp should be 4-D Tensor.");
+    PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2.");
+    PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0],
+                      "Input(X) and Input(Grid) dims[0] should be equal.");
+    PADDLE_ENFORCE_EQ(
+        grid_dims[1], x_dims[2],
+        "Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
+    PADDLE_ENFORCE_EQ(
+        grid_dims[2], x_dims[3],
+        "Input(X) dims[3] and Input(Grid) dims[2] should be equal.");
+
+    ctx->SetOutputDim("Output", x_dims);
+    ctx->ShareLoD("X", "Output");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_CUDA
+    if (platform::CanCUDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kCUDNN;
+    }
+#endif
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+        framework::DataLayout::kAnyLayout, library_);
+  }
+};
+
+class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) The input data of GridSampleOp, "
+             "This is a 4-D tensor with shape of [N, C, H, W]");
+    AddInput(
+        "Grid",
+        "(Tensor) The input grid of GridSampleOp generated by AffineGridOp, "
+        "This is a 4-D tensor with shape of [N, H, W, 2] is the concatenation "
+        "of x and y coordinates with shape [N, H, W] in last dimention");
+    AddOutput("Output", "(Tensor) Output tensor with shape [N, C, H, W]");
+    AddAttr<bool>(
+        "use_cudnn",
+        "(bool, default true) Only used in cudnn kernel, need install cudnn")
+        .SetDefault(true);
+
+    AddComment(R"DOC(
+      This operation samples input X by using bilinear interpolation based on 
+      flow field grid, which is usually gennerated by affine_grid. The grid of
+      shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates 
+      with shape [N, H, W] each, where grid_x is indexing the 4th dimension 
+      (in width dimension) of input data x and grid_y is indexng the 3rd 
+      dimention (in height dimension), finally results is the bilinear 
+      interpolation value of 4 nearest corner points.
+
+      Step 1:
+        Get (x, y) grid coordinates and scale to [0, H-1/W-1].
+
+        grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
+        grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
+
+      Step 2:
+        Indices input data X with grid (x, y) in each [H, W] area, and bilinear 
+        interpolate point value by 4 nearest points.
+
+          wn ------- y_n ------- en
+          |           |           |
+          |          d_n          |
+          |           |           |
+         x_w --d_w-- grid--d_e-- x_e
+          |           |           |
+          |          d_s          |
+          |           |           |
+          ws ------- y_s ------- wn
+
+        x_w = floor(x)              // west side x coord
+        x_e = x_w + 1               // east side x coord
+        y_n = floor(y)              // north side y coord
+        y_s = y_s + 1               // south side y coord
+
+        d_w = grid_x - x_w          // distance to west side
+        d_e = x_e - grid_x          // distance to east side
+        d_n = grid_y - y_n          // distance to north side
+        d_s = y_s - grid_y          // distance to south side
+
+        wn = X[:, :, y_n, x_w]      // north-west point value
+        en = X[:, :, y_n, x_e]      // north-east point value
+        ws = X[:, :, y_s, x_w]      // south-east point value
+        es = X[:, :, y_s, x_w]      // north-east point value
+
+        output = wn * d_e * d_s + en * d_w * d_s
+               + ws * d_e * d_n + es * d_w * d_n
+        )DOC");
+  }
+};
+
+class GridSampleOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto input_dims = ctx->GetInputDim("X");
+    auto grid_dims = ctx->GetInputDim("Grid");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), input_dims);
+    }
+    if (ctx->HasOutput(framework::GradVarName("Grid"))) {
+      ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+#ifdef PADDLE_WITH_CUDA
+    if (platform::CanCUDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kCUDNN;
+    }
+#endif
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+        framework::DataLayout::kAnyLayout, library_);
+  }
+};
+
+class GridSampleGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("grid_sampler_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("Grid", Input("Grid"));
+    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Grid"), InputGrad("Grid"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
+                  ops::GridSampleGradMaker);
+REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    grid_sampler,
+    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    grid_sampler_grad,
+    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h
new file mode 100644
index 0000000000..0d5874fc0c
--- /dev/null
+++ b/paddle/fluid/operators/grid_sampler_op.h
@@ -0,0 +1,322 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+using Array3 = Eigen::DSizes<int64_t, 3>;
+using Array4 = Eigen::DSizes<int64_t, 4>;
+
+template <typename T>
+static inline bool isInBound(T x, T y, T x_max, T y_max) {
+  if (x < 0 || x > x_max || y < 0 || y > y_max) {
+    return false;
+  }
+  return true;
+}
+
+template <typename T>
+static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
+                              const Tensor& grid, Tensor* x_w, Tensor* x_e,
+                              Tensor* y_n, Tensor* y_s, Tensor* d_w,
+                              Tensor* d_e, Tensor* d_n, Tensor* d_s) {
+  auto& place = *ctx.eigen_device();
+  const int n = grid.dims()[0];
+  const int h = grid.dims()[1];
+  const int w = grid.dims()[2];
+  const T x_max = static_cast<T>(w - 1);
+  const T y_max = static_cast<T>(h - 1);
+
+  // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
+  Tensor grid_x, grid_y;
+  T* grid_x_data = grid_x.mutable_data<T>({n, h, w}, ctx.GetPlace());
+  T* grid_y_data = grid_y.mutable_data<T>({n, h, w}, ctx.GetPlace());
+  const T* grid_data = grid.data<T>();
+  for (int i = 0; i < n * h * w; i++) {
+    grid_x_data[i] = grid_data[2 * i];
+    grid_y_data[i] = grid_data[(2 * i) + 1];
+  }
+
+  Tensor ones;
+  ones.mutable_data<T>({n, h, w}, ctx.GetPlace());
+  auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0);
+
+  // scale grid to [0, h-1/w-1]
+  auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
+  auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
+  grid_x_t.device(place) = 0.5 * ((grid_x_t + ones_t) * x_max);
+  grid_y_t.device(place) = 0.5 * ((grid_y_t + ones_t) * y_max);
+
+  // calculate coords of 4 corner points
+  x_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
+  x_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
+  y_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
+  y_s->mutable_data<T>({n, h, w}, ctx.GetPlace());
+  auto x_w_t = EigenTensor<T, 3>::From(*x_w);
+  auto x_e_t = EigenTensor<T, 3>::From(*x_e);
+  auto y_n_t = EigenTensor<T, 3>::From(*y_n);
+  auto y_s_t = EigenTensor<T, 3>::From(*y_s);
+  x_w_t.device(place) = grid_x_t.floor();
+  x_e_t.device(place) = x_w_t + ones_t;
+  y_n_t.device(place) = grid_y_t.floor();
+  y_s_t.device(place) = y_n_t + ones_t;
+
+  // calculate distances to 4 sides
+  d_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
+  d_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
+  d_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
+  d_s->mutable_data<T>({n, h, w}, ctx.GetPlace());
+  auto d_w_t = EigenTensor<T, 3>::From(*d_w);
+  auto d_e_t = EigenTensor<T, 3>::From(*d_e);
+  auto d_n_t = EigenTensor<T, 3>::From(*d_n);
+  auto d_s_t = EigenTensor<T, 3>::From(*d_s);
+  d_w_t.device(place) = grid_x_t - x_w_t;
+  d_e_t.device(place) = x_e_t - grid_x_t;
+  d_n_t.device(place) = grid_y_t - y_n_t;
+  d_s_t.device(place) = y_s_t - grid_y_t;
+}
+
+template <typename T>
+static void GetGridPointValue(const Tensor& input, Tensor* output,
+                              const Tensor& x, const Tensor& y) {
+  const int n = input.dims()[0];
+  const int c = input.dims()[1];
+  const int h = input.dims()[2];
+  const int w = input.dims()[3];
+  auto x_t = EigenTensor<T, 3>::From(x);
+  auto y_t = EigenTensor<T, 3>::From(y);
+  auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
+  auto input_t = EigenTensor<T, 4>::From(input);
+
+  for (int i = 0; i < n; i++) {
+    for (int k = 0; k < h; k++) {
+      for (int l = 0; l < w; l++) {
+        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(w - 1), (T)(h - 1))) {
+          for (int j = 0; j < c; j++) {
+            output_t(i, j, k, l) =
+                input_t(i, j, static_cast<int>(round(y_t(i, k, l))),
+                        static_cast<int>(round(x_t(i, k, l))));
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static void GatherOutputGradToInputGrad(const Tensor& output_grad,
+                                        Tensor* input_grad, const Tensor& x,
+                                        const Tensor& y, const Tensor& d1,
+                                        const Tensor& d2) {
+  const int n = output_grad.dims()[0];
+  const int c = output_grad.dims()[1];
+  const int h = output_grad.dims()[2];
+  const int w = output_grad.dims()[3];
+  auto x_t = EigenTensor<T, 3>::From(x);
+  auto y_t = EigenTensor<T, 3>::From(y);
+  auto d1_t = EigenTensor<T, 3>::From(d1);
+  auto d2_t = EigenTensor<T, 3>::From(d2);
+  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
+  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+
+  for (int i = 0; i < n; i++) {
+    for (int k = 0; k < h; k++) {
+      for (int l = 0; l < w; l++) {
+        if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(w - 1), (T)(h - 1))) {
+          for (int j = 0; j < c; j++) {
+            input_grad_t(i, j, static_cast<int>(round(y_t(i, k, l))),
+                         static_cast<int>(round(x_t(i, k, l)))) +=
+                output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class GridSampleOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto* input = ctx.Input<Tensor>("X");
+    auto* grid = ctx.Input<Tensor>("Grid");
+
+    const int n = input->dims()[0];
+    const int c = input->dims()[1];
+    const int h = input->dims()[2];
+    const int w = input->dims()[3];
+
+    // calc locations and distances of 4 corner points
+    Tensor x_w, x_e, y_n, y_s;
+    Tensor d_w, d_e, d_n, d_s;
+    CalcGridLocations<T>(
+        ctx.template device_context<platform::CPUDeviceContext>(), *grid, &x_w,
+        &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s);
+
+    auto* output = ctx.Output<Tensor>("Output");
+    output->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), output,
+        static_cast<T>(0));
+
+    // calc 4 corner points value
+    Tensor v_wn, v_en, v_ws, v_es;
+    v_wn.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    v_en.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    v_ws.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    v_es.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    GetGridPointValue<T>(*input, &v_wn, x_w, y_n);
+    GetGridPointValue<T>(*input, &v_en, x_e, y_n);
+    GetGridPointValue<T>(*input, &v_ws, x_w, y_s);
+    GetGridPointValue<T>(*input, &v_es, x_e, y_s);
+
+    auto d_w_t = EigenTensor<T, 3>::From(d_w);
+    auto d_e_t = EigenTensor<T, 3>::From(d_e);
+    auto d_n_t = EigenTensor<T, 3>::From(d_n);
+    auto d_s_t = EigenTensor<T, 3>::From(d_s);
+    auto d_w_scaled_t =
+        d_w_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
+    auto d_e_scaled_t =
+        d_e_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
+    auto d_n_scaled_t =
+        d_n_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
+    auto d_s_scaled_t =
+        d_s_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
+    auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
+    auto v_en_t = EigenTensor<T, 4>::From(v_en);
+    auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
+    auto v_es_t = EigenTensor<T, 4>::From(v_es);
+    auto output_t = EigenTensor<T, 4>::From(*output);
+    // bilinear interpolaetion by 4 corner points
+    output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
+                             v_en_t * d_w_scaled_t * d_s_scaled_t +
+                             v_ws_t * d_e_scaled_t * d_n_scaled_t +
+                             v_es_t * d_w_scaled_t * d_n_scaled_t;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class GridSampleGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* grid = ctx.Input<Tensor>("Grid");
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+
+    const int n = input->dims()[0];
+    const int c = input->dims()[1];
+    const int h = input->dims()[2];
+    const int w = input->dims()[3];
+
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), input_grad,
+        static_cast<T>(0));
+    auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
+    grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), grid_grad,
+        static_cast<T>(0));
+
+    Tensor x_w, x_e, y_n, y_s;
+    Tensor d_w, d_e, d_n, d_s;
+    CalcGridLocations<T>(
+        ctx.template device_context<platform::CPUDeviceContext>(), *grid, &x_w,
+        &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s);
+
+    // gather output grad value to input grad by corner point coords and weight
+    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_w, y_n, d_e,
+                                   d_s);
+    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_w, y_s, d_e,
+                                   d_n);
+    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_e, y_n, d_w,
+                                   d_s);
+    GatherOutputGradToInputGrad<T>(*output_grad, input_grad, x_e, y_s, d_w,
+                                   d_n);
+
+    // calc 4 corner points value
+    Tensor v_wn, v_en, v_ws, v_es;
+    v_wn.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    v_en.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    v_ws.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    v_es.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    GetGridPointValue<T>(*input, &v_wn, x_w, y_n);
+    GetGridPointValue<T>(*input, &v_en, x_e, y_n);
+    GetGridPointValue<T>(*input, &v_ws, x_w, y_s);
+    GetGridPointValue<T>(*input, &v_es, x_e, y_s);
+    auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
+    auto v_en_t = EigenTensor<T, 4>::From(v_en);
+    auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
+    auto v_es_t = EigenTensor<T, 4>::From(v_es);
+
+    auto d_w_t = EigenTensor<T, 3>::From(d_w);
+    auto d_e_t = EigenTensor<T, 3>::From(d_e);
+    auto d_n_t = EigenTensor<T, 3>::From(d_n);
+    auto d_s_t = EigenTensor<T, 3>::From(d_s);
+
+    auto output_grad_t = EigenTensor<T, 4>::From(*output_grad);
+
+    Tensor grid_grad_x, grid_grad_y;
+    grid_grad_x.mutable_data<T>({n, h, w}, ctx.GetPlace());
+    grid_grad_y.mutable_data<T>({n, h, w}, ctx.GetPlace());
+    auto grid_grad_x_t = EigenTensor<T, 3>::From(grid_grad_x).setConstant(0.0);
+    auto grid_grad_y_t = EigenTensor<T, 3>::From(grid_grad_y).setConstant(0.0);
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < c; j++) {
+        for (int k = 0; k < h; k++) {
+          for (int l = 0; l < w; l++) {
+            grid_grad_x_t(i, k, l) +=
+                ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) +
+                 (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) *
+                output_grad_t(i, j, k, l);
+            grid_grad_y_t(i, k, l) +=
+                ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) +
+                 (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) *
+                output_grad_t(i, j, k, l);
+          }
+        }
+      }
+    }
+    const T x_max = static_cast<T>(w - 1);
+    const T y_max = static_cast<T>(h - 1);
+    grid_grad_x_t = grid_grad_x_t * (x_max / (T)2);
+    grid_grad_y_t = grid_grad_y_t * (y_max / (T)2);
+
+    // gather grid_grad [x, y] in 3rd Dim
+    T* grid_grad_data = grid_grad->data<T>();
+    T* grid_grad_x_data = grid_grad_x.data<T>();
+    T* grid_grad_y_data = grid_grad_y.data<T>();
+    for (int i = 0; i < n * h * w; i++) {
+      grid_grad_data[2 * i] = grid_grad_x_data[i];
+      grid_grad_data[2 * i + 1] = grid_grad_y_data[i];
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc
new file mode 100644
index 0000000000..b9ebe71a3d
--- /dev/null
+++ b/paddle/fluid/operators/hash_op.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/hash_op.h"
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class HashOp : public framework::OperatorWithKernel {
+ public:
+  HashOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of HashOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of HashOp should not be null.");
+
+    auto dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(dims.size(), 2UL,
+                      "The input of hash_op's dimensions must be 2");
+    std::vector<int64_t> out_dims;
+    out_dims.reserve(dims.size() + 1);
+    // copy all dims except the last one
+    for (size_t i = 0u; i != dims.size() - 1; ++i) {
+      out_dims.emplace_back(dims[i]);
+    }
+    int num_hash = ctx->Attrs().Get<int>("num_hash");
+    out_dims.emplace_back(num_hash);
+    // keep the last dim to 1
+    out_dims.emplace_back(1);
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class HashOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input tensor of scale operator.");
+    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
+    AddComment(R"DOC(
+**Hash Operator**
+$$Out = scale * X$$
+)DOC");
+    AddAttr<int>("num_hash", "").SetDefault(1);
+    AddAttr<int>("mod_by", "").SetDefault(100000);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_WITHOUT_GRADIENT(hash, ops::HashOp, ops::HashOpMaker);
+REGISTER_OP_CPU_KERNEL(hash, ops::HashKerel<int>, ops::HashKerel<int64_t>);
diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h
new file mode 100644
index 0000000000..9781bb0f45
--- /dev/null
+++ b/paddle/fluid/operators/hash_op.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+extern "C" {
+#include <xxhash.h>
+}
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+// template <typename DeviceContext, typename T>
+template <typename T>
+class HashKerel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* out_t = context.Output<framework::LoDTensor>("Out");
+    auto* in_t = context.Input<framework::LoDTensor>("X");
+    int mod_by = context.Attr<int>("mod_by");
+    int num_hash = context.Attr<int>("num_hash");
+    auto* output = out_t->mutable_data<T>(context.GetPlace());
+
+    auto in_dims = in_t->dims();
+    auto in_lod = in_t->lod();
+    PADDLE_ENFORCE_EQ(
+        static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
+        "The actual input data's size mismatched with LoD information.");
+
+    auto seq_length = in_dims[0];
+    auto last_dim = in_dims[in_dims.size() - 1];
+    auto* input = in_t->data<T>();
+    for (int idx = 0; idx < seq_length; ++idx) {
+      for (int ihash = 0; ihash != num_hash; ++ihash) {
+        output[idx * num_hash + ihash] =
+            XXH64(input, sizeof(int) * last_dim, ihash) % mod_by;
+      }
+      input += last_dim;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/lars_momentum_op.cc b/paddle/fluid/operators/lars_momentum_op.cc
new file mode 100644
index 0000000000..a8dda93902
--- /dev/null
+++ b/paddle/fluid/operators/lars_momentum_op.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/lars_momentum_op.h"
+#include "paddle/fluid/operators/momentum_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Param",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Input parameter that has to be updated");
+    AddInput("Grad",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Input gradient of the parameter");
+    AddInput("Velocity",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Input velocity (corresponding to the parameter) "
+             "that has to be updated");
+    AddInput("LearningRate",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Input learning rate");
+
+    AddOutput("ParamOut",
+              "(LoDTensor) This output is updated parameter. "
+              "It shared memory with Input(Param).");
+    AddOutput("VelocityOut",
+              "(LoDTensor) This output is updated velocity. "
+              "It shared memory with Input(Velocity).");
+
+    AddAttr<float>("mu", "(float) Momentum coefficient");
+    AddAttr<float>("lars_coeff", "(float, default 0.001) LARS coefficient.")
+        .SetDefault(0.001);
+    AddAttr<float>("lars_weight_decay",
+                   "(float, default 0.0005) LARS weight decay")
+        .SetDefault(0.0005);
+
+    AddComment(R"DOC(
+Lars Momentum Optimizer.
+
+This optimizer use LARS (https://arxiv.org/abs/1708.03888) to optimize each
+weight using a local learning rate:
+
+$$
+local\_lr = \eta  * 
+    \frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\
+velocity = mu * velocity + 
+    local\_lr * (grad + \beta * param) \\
+param = param - velocity. \\
+$$
+
+Note that we use lars_weight_decay here to decay weights, you may need not to
+use L2 regularizers in case of using LARS.
+
+)DOC");
+  }
+};
+
+class LarsMomentumOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {}
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(lars_momentum, ops::MomentumOp, ops::LarsMomentumOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::LarsMomentumOpVarTypeInference);
+REGISTER_OP_CPU_KERNEL(lars_momentum, ops::LarsMomentumOpKernel<float>,
+                       ops::LarsMomentumOpKernel<double>);
diff --git a/paddle/fluid/operators/lars_momentum_op.cu b/paddle/fluid/operators/lars_momentum_op.cu
new file mode 100644
index 0000000000..eb346851a2
--- /dev/null
+++ b/paddle/fluid/operators/lars_momentum_op.cu
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/lars_momentum_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
+                                   const T* learning_rate, const T mu,
+                                   const int64_t num, const T lars_coeff,
+                                   const T lars_weight_decay, const T* p_norm,
+                                   const T* g_norm, T* p_out, T* v_out) {
+  T lr = learning_rate[0];
+  T local_lr = learning_rate[0];
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    if (p_norm[0] > 0 && g_norm[0] > 0) {
+      local_lr = lr * lars_coeff * p_norm[0] /
+                 (g_norm[0] + lars_weight_decay * p_norm[0]);
+    }
+    T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
+    v_out[i] = v_new;
+    p_out[i] = p[i] - v_new;
+  }
+}
+
+template <typename DeviceContext, typename T>
+class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
+    auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
+    auto param = ctx.Input<framework::LoDTensor>("Param");
+    auto velocity = ctx.Input<framework::LoDTensor>("Velocity");
+    auto grad = ctx.Input<framework::LoDTensor>("Grad");
+    auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
+
+    T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
+    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
+
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
+    T lars_coeff = ctx.Attr<float>("lars_coeff");
+    T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
+
+    auto* p = param->data<T>();
+    auto* v = velocity->data<T>();
+    auto* g = grad->data<T>();
+    auto* lr = learning_rate->data<T>();
+
+    int block = 512;
+    int grid = (param->numel() + block - 1) / block;
+
+    auto eigen_p = framework::EigenVector<T>::Flatten(*param);
+    auto eigen_g = framework::EigenVector<T>::Flatten(*grad);
+    // calculate norms using eigein and launch the kernel.
+    framework::Tensor p_norm_t, g_norm_t;
+    p_norm_t.Resize({1});
+    g_norm_t.Resize({1});
+    auto* p_norm_data = p_norm_t.mutable_data<T>(ctx.GetPlace());
+    auto* g_norm_data = g_norm_t.mutable_data<T>(ctx.GetPlace());
+    auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
+    auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
+
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+    ep_norm.device(*place) = eigen_p.square().sum().sqrt();
+    eg_norm.device(*place) = eigen_g.square().sum().sqrt();
+    MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
+        p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
+        p_norm_data, g_norm_data, p_out, v_out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    lars_momentum,
+    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/lars_momentum_op.h b/paddle/fluid/operators/lars_momentum_op.h
new file mode 100644
index 0000000000..e85be99fc4
--- /dev/null
+++ b/paddle/fluid/operators/lars_momentum_op.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class LarsMomentumOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
+    auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
+    auto param = ctx.Input<framework::LoDTensor>("Param");
+    auto velocity = ctx.Input<framework::LoDTensor>("Velocity");
+    auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
+    auto* grad_var = ctx.InputVar("Grad");
+    // only support dense for now.
+    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>());
+    auto grad = ctx.Input<framework::LoDTensor>("Grad");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+    velocity_out->mutable_data<T>(ctx.GetPlace());
+
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
+    T lars_coeff = ctx.Attr<float>("lars_coeff");
+    T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
+
+    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
+    auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
+
+    auto p = framework::EigenVector<T>::Flatten(*param);
+    auto v = framework::EigenVector<T>::Flatten(*velocity);
+    auto g = framework::EigenVector<T>::Flatten(*grad);
+    auto* lr = learning_rate->data<T>();
+
+    framework::Tensor p_norm_t, g_norm_t;
+    p_norm_t.Resize({1});
+    g_norm_t.Resize({1});
+    p_norm_t.mutable_data<T>(ctx.GetPlace());
+    g_norm_t.mutable_data<T>(ctx.GetPlace());
+    auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
+    auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
+
+    ep_norm = p.square().sum().sqrt();
+    eg_norm = g.square().sum().sqrt();
+    T local_lr = lr[0];
+    if (ep_norm(0) > 0 && eg_norm(0) > 0) {
+      local_lr = lr[0] * lars_coeff * ep_norm(0) /
+                 (eg_norm(0) + lars_weight_decay * ep_norm(0));
+    }
+    v_out = v * mu + local_lr * (g + lars_weight_decay * p);
+    p_out = p - v_out;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 26f09c46c2..a038bad701 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -27,6 +27,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 
+DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send");
+DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get");
+DEFINE_int32(rpc_prefetch_thread_num, 5, "number of threads for rpc prefetch");
+
 namespace paddle {
 namespace operators {
 
@@ -332,11 +336,14 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
       sync_mode, checkpoint_block_id));
 
   rpc_service_->RegisterRPC(distributed::kRequestSend,
-                            request_send_handler_.get());
+                            request_send_handler_.get(),
+                            FLAGS_rpc_send_thread_num);
   rpc_service_->RegisterRPC(distributed::kRequestGet,
-                            request_get_handler_.get());
+                            request_get_handler_.get(),
+                            FLAGS_rpc_get_thread_num);
   rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
-                            request_prefetch_handler_.get());
+                            request_prefetch_handler_.get(),
+                            FLAGS_rpc_prefetch_thread_num);
   rpc_service_->RegisterRPC(distributed::kRequestCheckpoint,
                             request_checkpoint_handler_.get());
 
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index 8eab83fcd2..e72337a3e6 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/port.h"
 
@@ -79,7 +79,7 @@ struct LoDTensorToArrayFunctor : public boost::static_visitor<void> {
 template <typename DeviceContext>
 template <typename T>
 void LoDTensorToArrayFunctorImpl<DeviceContext>::apply() {
-  math::ConcatGradFunctor<DeviceContext, T> func;
+  math::SplitFunctor<DeviceContext, T> func;
   func(*dev_ctx_, prev_functor_->input_, prev_functor_->ref_inputs_, 0,
        &prev_functor_->outputs_);
 }
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index b9ac54e446..3226a727b1 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -81,6 +81,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                      "Otherwise the given value indicates padding the output "
                      "with zeros whenever lookup encounters it in Ids.")
         .SetDefault(kNoPadding);
+    // NOTE(minqiyang): grad_inplace is an temporal attribute,
+    // please do NOT set this attribute in python layer.
+    AddAttr<bool>("grad_inplace",
+                  "(boolean, default false) "
+                  "If the grad op reuse the input's variable.")
+        .SetDefault(false);
     AddComment(R"DOC(
 Lookup Table Operator.
 
@@ -115,7 +121,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Out"));
     return framework::OpKernelType(data_type, ctx.device_context());
   }
 };
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 58463dc4d6..e504c4f0cd 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
@@ -68,6 +69,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
       const auto *table = table_t.value().data<T>();
       auto *output = output_t->mutable_data<T>(context.GetPlace());
 
+      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
       for (int64_t i = 0; i < ids_numel; ++i) {
         if (padding_idx != kNoPadding && ids[i] == padding_idx) {
           memset(output + i * row_width, 0, row_width * sizeof(T));
@@ -75,8 +77,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
           PADDLE_ENFORCE_GE(ids[i], 0);
           auto id_index = table_t.Index(ids[i]);
           PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
-          memcpy(output + i * row_width, table + id_index * row_width,
-                 row_width * sizeof(T));
+          blas.VCOPY(row_width, table + id_index * row_width,
+                     output + i * row_width);
         }
       }
     }
@@ -111,27 +113,37 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       auto *ids_data = ids->data<int64_t>();
       int64_t ids_num = ids->numel();
 
-      framework::Vector<int64_t> new_rows;
-      new_rows.reserve(ids_num);
-      for (int64_t i = 0; i < ids_num; i++) {
-        new_rows.push_back(ids_data[i]);
-      }
+      std::vector<int64_t> new_rows;
+      new_rows.resize(ids_num);
+      std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t));
       d_table->set_rows(new_rows);
 
       auto *d_table_value = d_table->mutable_value();
       d_table_value->Resize({ids_num, table_dim[1]});
-      d_table_value->mutable_data<T>(context.GetPlace());
-
-      d_table->set_height(table_dim[0]);
-
-      auto *d_output_data = d_output->data<T>();
-      auto *d_table_data = d_table_value->data<T>();
-
-      auto d_output_dims = d_output->dims();
-      PADDLE_ENFORCE_EQ(
-          d_table_value->dims(),
-          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
-      memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
+      // FIXME(minqiyang):
+      // memory optimization will NOT reuse Tensor with SelectedRows
+      // so we could just share the tensor here directly.
+      // However, the InferVarType method will infer the output SelectedRows
+      // to Tensor sometimes, which is a bug, so we will add an attribute
+      // here to indicate the inplace and remove this attribute after
+      // the InferVarType's bug was fixed
+      bool grad_inplace = context.Attr<bool>("grad_inplace");
+      if (grad_inplace) {
+        d_table_value->ShareDataWith(*d_output);
+      } else {
+        d_table_value->mutable_data<T>(context.GetPlace());
+
+        d_table->set_height(table_dim[0]);
+
+        auto *d_output_data = d_output->data<T>();
+        auto *d_table_data = d_table_value->data<T>();
+
+        auto d_output_dims = d_output->dims();
+        PADDLE_ENFORCE_EQ(
+            d_table_value->dims(),
+            framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
+        memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
+      }
     } else {
       auto *ids = context.Input<LoDTensor>("Ids");
       auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index c7bdec3547..17b675fba8 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -1,5 +1,5 @@
 if (NOT WIN32)
-add_subdirectory(detail)
+    add_subdirectory(detail)
 endif(NOT WIN32)
 
 function(math_library TARGET)
@@ -35,7 +35,7 @@ function(math_library TARGET)
 endfunction()
 
 # please add new math_library in alphabetical order
-math_library(concat)
+math_library(concat_and_split)
 math_library(context_project DEPS im2col math_function)
 math_library(cross_entropy)
 math_library(cos_sim_functor)
@@ -43,8 +43,8 @@ math_library(depthwise_conv)
 math_library(im2col)
 
 if (NOT WIN32) # windows do not support avx functions yet.
-math_library(gru_compute DEPS activation_functions math_function)
-math_library(lstm_compute DEPS activation_functions)
+    math_library(gru_compute DEPS activation_functions math_function)
+    math_library(lstm_compute DEPS activation_functions)
 endif (NOT WIN32)
 
 cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
@@ -58,7 +58,7 @@ math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
 if (NOT WIN32)
-math_library(matrix_bit_code)
+    math_library(matrix_bit_code)
 endif (NOT WIN32)
 math_library(unpooling)
 math_library(vol2col)
@@ -68,13 +68,14 @@ cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selec
 cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
 cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
 cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
+cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling)
 if(WITH_GPU)
     nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function)
     nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function)
 endif()
-cc_test(concat_test SRCS concat_test.cc DEPS concat)
+cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
 cc_library(jit_kernel 
-    SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_lstm.cc
+    SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
     DEPS cpu_info cblas)
 cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
diff --git a/paddle/fluid/operators/math/algorithm.h b/paddle/fluid/operators/math/algorithm.h
index 262469beea..2e75b6abce 100644
--- a/paddle/fluid/operators/math/algorithm.h
+++ b/paddle/fluid/operators/math/algorithm.h
@@ -39,6 +39,52 @@ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) {
   return -1;
 }
 
+template <typename T>
+HOSTDEVICE inline size_t LowerBound(const T *x, size_t num, const T &val) {
+#ifdef __CUDA_ARCH__
+  // The following code is from
+  // https://en.cppreference.com/w/cpp/algorithm/lower_bound
+  auto *first = x;
+  int64_t count = static_cast<int64_t>(num);
+  while (count > 0) {
+    int64_t step = (count >> 1);
+    auto *it = first + step;
+    if (*it < val) {
+      first = ++it;
+      count -= (step + 1);
+    } else {
+      count = step;
+    }
+  }
+  return static_cast<size_t>(first - x);
+#else
+  return static_cast<size_t>(std::lower_bound(x, x + num, val) - x);
+#endif
+}
+
+template <typename T>
+HOSTDEVICE inline size_t UpperBound(const T *x, size_t num, const T &val) {
+#ifdef __CUDA_ARCH__
+  // The following code is from
+  // https://en.cppreference.com/w/cpp/algorithm/upper_bound
+  auto *first = x;
+  int64_t count = static_cast<int64_t>(num);
+  while (count > 0) {
+    auto step = (count >> 1);
+    auto *it = first + step;
+    if (val < *it) {
+      count = step;
+    } else {
+      first = ++it;
+      count -= (step + 1);
+    }
+  }
+  return static_cast<size_t>(first - x);
+#else
+  return static_cast<size_t>(std::upper_bound(x, x + num, val) - x);
+#endif
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat_and_split.cc
similarity index 95%
rename from paddle/fluid/operators/math/concat.cc
rename to paddle/fluid/operators/math/concat_and_split.cc
index 7b79f10e33..c6e17fd042 100644
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat_and_split.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include <vector>
 
 namespace paddle {
@@ -67,7 +67,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
  * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
-class ConcatGradFunctor<platform::CPUDeviceContext, T> {
+class SplitFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input,
@@ -111,7 +111,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
 };
 #define DEFINE_FUNCTOR(type)                                      \
   template class ConcatFunctor<platform::CPUDeviceContext, type>; \
-  template class ConcatGradFunctor<platform::CPUDeviceContext, type>;
+  template class SplitFunctor<platform::CPUDeviceContext, type>;
 
 FOR_ALL_TYPES(DEFINE_FUNCTOR);
 
diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat_and_split.cu
similarity index 90%
rename from paddle/fluid/operators/math/concat.cu
rename to paddle/fluid/operators/math/concat_and_split.cu
index b59d86e661..760a065c10 100644
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat_and_split.cu
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -24,7 +24,7 @@ namespace operators {
 namespace math {
 
 template <typename T>
-__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
+__global__ void ConcatKernel(T** inputs, const int* input_cols, int col_size,
                              const int output_rows, const int output_cols,
                              T* output) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -50,7 +50,7 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
 }
 
 template <typename T>
-__global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
+__global__ void ConcatKernel(T** inputs_data, const int fixed_in_col,
                              const int out_rows, const int out_cols,
                              T* output_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -67,9 +67,9 @@ __global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
 }
 
 template <typename T>
-__global__ void KernelConcatGrad(const T* input_data, const int in_row,
-                                 const int in_col, const int* out_cols,
-                                 int out_cols_size, T** outputs_data) {
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int* out_cols,
+                            int out_cols_size, T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
   int curr_segment = 0;
   int curr_offset = out_cols[0];
@@ -94,9 +94,9 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
 }
 
 template <typename T>
-__global__ void KernelConcatGrad(const T* input_data, const int in_row,
-                                 const int in_col, const int fixed_out_col,
-                                 T** outputs_data) {
+__global__ void SplitKernel(const T* input_data, const int in_row,
+                            const int in_col, const int fixed_out_col,
+                            T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
   for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
     int split = tid_x / fixed_out_col;
@@ -170,11 +170,11 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
     if (sameShape) {
-      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
+      ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
           dev_ins_data, in_col, out_row, out_col, output->data<T>());
     } else {
       const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
-      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
+      ConcatKernel<<<grid_size, block_size, 0, context.stream()>>>(
           dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
           out_row, out_col, output->data<T>());
     }
@@ -189,7 +189,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
  * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
-class ConcatGradFunctor<platform::CUDADeviceContext, T> {
+class SplitFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
@@ -248,11 +248,11 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
     if (sameShape) {
-      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
+      SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
           input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
     } else {
       const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
-      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
+      SplitKernel<<<grid_size, block_size, 0, context.stream()>>>(
           input.data<T>(), in_row, in_col, dev_outs_col_data,
           static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
     }
@@ -264,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 
 #define DEFINE_FUNCTOR(type)                                       \
   template class ConcatFunctor<platform::CUDADeviceContext, type>; \
-  template class ConcatGradFunctor<platform::CUDADeviceContext, type>
+  template class SplitFunctor<platform::CUDADeviceContext, type>
 
 FOR_ALL_TYPES(DEFINE_FUNCTOR);
 
diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat_and_split.h
similarity index 98%
rename from paddle/fluid/operators/math/concat.h
rename to paddle/fluid/operators/math/concat_and_split.h
index 867a84fa87..3a5eddcbf4 100644
--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat_and_split.h
@@ -54,7 +54,7 @@ class ConcatFunctor {
  *     Output[1] = [[5,6]]
  */
 template <typename DeviceContext, typename T>
-class ConcatGradFunctor {
+class SplitFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const std::vector<const framework::Tensor*>& ref_inputs,
diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc
index a46f2d51ca..8ba9e8e8ec 100644
--- a/paddle/fluid/operators/math/concat_test.cc
+++ b/paddle/fluid/operators/math/concat_test.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/math/concat.h"
 #include <gtest/gtest.h>
 #include <vector>
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 
 template <typename DeviceContext, typename Place>
 void testConcat() {
diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h
index e91e4e8e5a..48e180b1fd 100644
--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -142,6 +142,22 @@ class LSTMKernel : public Kernel {
                            const T *wp_data = nullptr) const = 0;
 };
 
+template <typename T>
+class GRUKernel : public Kernel {
+ public:
+  // compute h1 without h0
+  virtual void ComputeH1(T *gates, T *ht) const = 0;
+  virtual void ComputeHtPart1(T *gates, const T *ht_1, T *ht) const = 0;
+  virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0;
+};
+
+template <typename T>
+class CRFDecodeKernel : public Kernel {
+ public:
+  virtual void Compute(const int seq_len, const T *x, const T *w, T *alpha,
+                       int *track) const = 0;
+};
+
 }  // namespace jitkernel
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
new file mode 100644
index 0000000000..e481d1921a
--- /dev/null
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
@@ -0,0 +1,296 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/jit_kernel.h"
+#include <limits>
+#include <string>
+#include "paddle/fluid/operators/math/jit_kernel_macro.h"
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace jitkernel {
+
+namespace jit = platform::jit;
+
+/* CRF Decode JitKernel */
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
+ public:
+  explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel<T>() {
+    this->num_ = tag_num;
+  }
+  void Compute(const int seq_len, const T* x, const T* w, T* alpha,
+               int* track) const override {
+    constexpr int state_trans_base_idx = 2;
+    for (int i = 0; i < this->num_; ++i) {
+      alpha[i] = w[i] + x[i];
+    }
+    for (int k = 1; k < seq_len; ++k) {
+      for (int i = 0; i < this->num_; ++i) {
+        T max_score = -std::numeric_limits<T>::max();
+        int max_j = 0;
+        for (int j = 0; j < this->num_; ++j) {
+          T score = alpha[(k - 1) * this->num_ + j] +
+                    w[(j + state_trans_base_idx) * this->num_ + i];
+          if (score > max_score) {
+            max_score = score;
+            max_j = j;
+          }
+        }
+        alpha[k * this->num_ + i] = max_score + x[k * this->num_ + i];
+        track[k * this->num_ + i] = max_j;
+      }
+    }
+  }
+};
+
+#define INIT_ALPHA(step_size)                                               \
+  /* Setup the alpha initial value.*/                                       \
+  int i_offset = 0;                                                         \
+  int last_offset = this->rest_ - step_size;                                \
+  for (int i = 0; i <= this->end_; ++i) {                                   \
+    /* weights, input and alpha values. */                                  \
+    __m256 w_content, x_content, alpha_content;                             \
+    /* Load the relevant data into the variables from un-aligned address.*/ \
+    w_content = _mm256_loadu_ps(w + i_offset);                              \
+    x_content = _mm256_loadu_ps(x + i_offset);                              \
+    alpha_content = _mm256_add_ps(w_content, x_content);                    \
+    _mm256_storeu_ps(alpha + i_offset, alpha_content);                      \
+    i_offset += step_size;                                                  \
+    if (i == this->end_ - 1) {                                              \
+      if (this->rest_ > 0) {                                                \
+        i_offset += last_offset;                                            \
+      } else {                                                              \
+        break;                                                              \
+      }                                                                     \
+    }                                                                       \
+  }
+
+#define UPDATE_ALPHA(step_size)                                               \
+  /* Update the alpha and track values. */                                    \
+  __m256 x_content = _mm256_loadu_ps(x + seq_offset + this->num_ + j_offset); \
+  max_score = _mm256_add_ps(max_score, x_content);                            \
+  _mm256_storeu_ps(alpha + seq_offset + this->num_ + j_offset, max_score);    \
+  _mm256_storeu_si256(                                                        \
+      reinterpret_cast<__m256i*>(track + seq_offset + this->num_ + j_offset), \
+      max_j);                                                                 \
+  /* Calculate the offset of next step*/                                      \
+  j_offset += step_size;                                                      \
+  if (j == this->end_ - 1) {                                                  \
+    if (this->rest_ > 0) {                                                    \
+      j_offset += last_offset;                                                \
+    } else {                                                                  \
+      break;                                                                  \
+    }                                                                         \
+  }
+
+#define INTRIAVX_FLOAT(block)                                                  \
+  template <>                                                                  \
+  CRFDecodeKernelImpl<float, jit::avx, block>::CRFDecodeKernelImpl(            \
+      int tag_num)                                                             \
+      : CRFDecodeKernel<float>() {                                             \
+    this->num_ = tag_num;                                                      \
+    this->end_ = this->num_ / AVX_FLOAT_BLOCK;                                 \
+    this->rest_ = this->num_ % AVX_FLOAT_BLOCK;                                \
+  }                                                                            \
+  template <>                                                                  \
+  void CRFDecodeKernelImpl<float, jit::avx, block>::Compute(                   \
+      const int seq_len, const float* x, const float* w, float* alpha,         \
+      int* track) const {                                                      \
+    INIT_ALPHA(AVX_FLOAT_BLOCK)                                                \
+    /* Use the column-major strategy to get the location of maximum score.*/   \
+    int seq_offset = 0;                                                        \
+    constexpr int state_trans_base_idx = 2;                                    \
+    for (int k = 1; k < seq_len; ++k) {                                        \
+      int j_offset = 0;                                                        \
+      for (int j = 0; j <= this->end_; ++j) {                                  \
+        /* Initialize the variables of maximum score and location.*/           \
+        __m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max()); \
+        __m256i max_j = _mm256_set1_epi32(0);                                  \
+        /* Calculate the offset of transition_weights.*/                       \
+        int trans_offset = state_trans_base_idx * this->num_ + j_offset;       \
+        for (int i = 0; i < this->num_; ++i) {                                 \
+          /* Initalize the content of alpha variable with related offset.*/    \
+          __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i);  \
+          /* Obtain the content of weights from un-aligned address.*/          \
+          __m256 w_content = _mm256_loadu_ps(w + trans_offset);                \
+          __m256 score_v = _mm256_add_ps(alpha_content, w_content);            \
+          __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);         \
+          /* According to the mask value, update the index of the max_score.*/ \
+          /* AVX instructions.*/                                               \
+          __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);               \
+          __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);               \
+          __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);        \
+          __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);        \
+          lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);                      \
+          hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);                      \
+          lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));                 \
+          hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));                 \
+          lo_max_j = _mm_or_si128(lo_mask, lo_max_j);                          \
+          hi_max_j = _mm_or_si128(hi_mask, hi_max_j);                          \
+          max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);                 \
+          max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);                 \
+          /* AVX done*/                                                        \
+          /* Update the max_score value.*/                                     \
+          max_score = _mm256_max_ps(max_score, score_v);                       \
+          trans_offset += this->num_;                                          \
+        }                                                                      \
+        UPDATE_ALPHA(AVX_FLOAT_BLOCK)                                          \
+      }                                                                        \
+      seq_offset += this->num_;                                                \
+    }                                                                          \
+  }
+
+#define INTRIAVX2_FLOAT(isa, block)                                            \
+  template <>                                                                  \
+  CRFDecodeKernelImpl<float, isa, block>::CRFDecodeKernelImpl(int tag_num)     \
+      : CRFDecodeKernel<float>() {                                             \
+    this->num_ = tag_num;                                                      \
+    this->end_ = this->num_ / AVX2_FLOAT_BLOCK;                                \
+    this->rest_ = this->num_ % AVX2_FLOAT_BLOCK;                               \
+  }                                                                            \
+  template <>                                                                  \
+  void CRFDecodeKernelImpl<float, isa, block>::Compute(                        \
+      const int seq_len, const float* x, const float* w, float* alpha,         \
+      int* track) const {                                                      \
+    INIT_ALPHA(AVX2_FLOAT_BLOCK)                                               \
+    /* Use the column-major strategy to get the location of maximum score.*/   \
+    int seq_offset = 0;                                                        \
+    constexpr int state_trans_base_idx = 2;                                    \
+    for (int k = 1; k < seq_len; ++k) {                                        \
+      int j_offset = 0;                                                        \
+      for (int j = 0; j <= this->end_; ++j) {                                  \
+        /* Initialize the variables of maximum score and location.*/           \
+        __m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max()); \
+        __m256i max_j = _mm256_set1_epi32(0);                                  \
+        /* Calculate the offset of transition_weights.*/                       \
+        int trans_offset = state_trans_base_idx * this->num_ + j_offset;       \
+        for (int i = 0; i < this->num_; ++i) {                                 \
+          /* Initalize the content of alpha variable with related offset.*/    \
+          __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i);  \
+          /* Obtain the content of weights from un-aligned address.*/          \
+          __m256 w_content = _mm256_loadu_ps(w + trans_offset);                \
+          __m256 score_v = _mm256_add_ps(alpha_content, w_content);            \
+          __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);         \
+          /* According to the mask value, update the index of the max_score.*/ \
+          /* AVX2 instructions.*/                                              \
+          max_j = _mm256_or_si256(                                             \
+              _mm256_andnot_si256((__m256i)mask, max_j),                       \
+              _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));          \
+          /* Update the max_score value.*/                                     \
+          max_score = _mm256_max_ps(max_score, score_v);                       \
+          trans_offset += this->num_;                                          \
+        }                                                                      \
+        UPDATE_ALPHA(AVX2_FLOAT_BLOCK)                                         \
+      }                                                                        \
+      seq_offset += this->num_;                                                \
+    }                                                                          \
+  }
+
+#define INTRIAVX512_FLOAT(block)                                               \
+  template <>                                                                  \
+  CRFDecodeKernelImpl<float, jit::avx512f, block>::CRFDecodeKernelImpl(        \
+      int tag_num)                                                             \
+      : CRFDecodeKernel<float>() {                                             \
+    this->num_ = tag_num;                                                      \
+    this->end_ = this->num_ / AVX512_FLOAT_BLOCK;                              \
+    this->rest_ = this->num_ % AVX512_FLOAT_BLOCK;                             \
+  }                                                                            \
+  template <>                                                                  \
+  void CRFDecodeKernelImpl<float, jit::avx512f, block>::Compute(               \
+      const int seq_len, const float* x, const float* w, float* alpha,         \
+      int* track) const {                                                      \
+    INIT_ALPHA(AVX512_FLOAT_BLOCK)                                             \
+    /* Use the column-major strategy to get the location of maximum score.*/   \
+    int seq_offset = 0;                                                        \
+    constexpr int state_trans_base_idx = 2;                                    \
+    for (int k = 1; k < seq_len; ++k) {                                        \
+      int j_offset = 0;                                                        \
+      for (int j = 0; j <= this->end_; ++j) {                                  \
+        /* Initialize the variables of maximum score and location.*/           \
+        __m512 max_score = _mm512_set1_ps(-std::numeric_limits<float>::max()); \
+        __m512i max_j = _mm512_setzero_si512();                                \
+        /* Calculate the offset of transition_weights.*/                       \
+        int trans_offset = state_trans_base_idx * this->num_ + j_offset;       \
+        for (int i = 0; i < this->num_; ++i) {                                 \
+          /* Initalize the content of alpha variable with related offset.*/    \
+          __m512 alpha_content = _mm512_set1_ps(*(alpha + seq_offset + i));    \
+          /* Obtain the content of weights from un-aligned address.*/          \
+          __m512 w_content = _mm512_loadu_ps(w + trans_offset);                \
+          __m512 score_v = _mm512_add_ps(alpha_content, w_content);            \
+          __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS); \
+          /* AVX512 instructions.*/                                            \
+          max_j = _mm512_mask_set1_epi32(max_j, mask, i);                      \
+          /* Update the max_score value.*/                                     \
+          max_score = _mm512_max_ps(max_score, score_v);                       \
+          trans_offset += this->num_;                                          \
+        }                                                                      \
+        /* Update the alpha and track values.*/                                \
+        __m512 x_content =                                                     \
+            _mm512_loadu_ps(x + seq_offset + this->num_ + j_offset);           \
+        max_score = _mm512_add_ps(max_score, x_content);                       \
+        _mm512_storeu_ps(alpha + seq_offset + this->num_ + j_offset,           \
+                         max_score);                                           \
+        _mm512_storeu_si512(reinterpret_cast<__m512i*>(track + seq_offset +    \
+                                                       this->num_ + j_offset), \
+                            max_j);                                            \
+        /* Calculate the offset of next step*/                                 \
+        j_offset += AVX512_FLOAT_BLOCK;                                        \
+        if (j == this->end_ - 1) {                                             \
+          if (this->rest_ > 0) {                                               \
+            j_offset += last_offset;                                           \
+          } else {                                                             \
+            break;                                                             \
+          }                                                                    \
+        }                                                                      \
+      }                                                                        \
+      seq_offset += this->num_;                                                \
+    }                                                                          \
+  }
+
+#ifdef __AVX__
+INTRIAVX_FLOAT(kEQ8);
+INTRIAVX_FLOAT(kGT8LT16);
+INTRIAVX_FLOAT(kEQ16);
+INTRIAVX_FLOAT(kGT16);
+#endif
+#ifdef __AVX2__
+INTRIAVX2_FLOAT(jit::avx2, kEQ8);
+INTRIAVX2_FLOAT(jit::avx2, kGT8LT16);
+INTRIAVX2_FLOAT(jit::avx2, kEQ16);
+INTRIAVX2_FLOAT(jit::avx2, kGT16);
+#endif
+#ifdef __AVX512F__
+INTRIAVX2_FLOAT(jit::avx512f, kEQ8);
+INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16);
+INTRIAVX512_FLOAT(kEQ16);
+INTRIAVX512_FLOAT(kGT16);
+#endif
+
+#undef INTRIAVX512_FLOAT
+#undef INTRIAVX2_FLOAT
+#undef INTRIAVX_FLOAT
+#undef INIT_ALPHA
+#undef UPDATE_ALPHA
+
+REGISTER_JITKERNEL(crf_decode, CRFDecodeKernel);
+
+}  // namespace jitkernel
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/jit_kernel_lstm.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc
similarity index 65%
rename from paddle/fluid/operators/math/jit_kernel_lstm.cc
rename to paddle/fluid/operators/math/jit_kernel_rnn.cc
index 26bd26e2e1..fab293f7d0 100644
--- a/paddle/fluid/operators/math/jit_kernel_lstm.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
@@ -136,6 +136,23 @@ static std::shared_ptr<const VActKernel<T>> GetActKernel(
   return nullptr;
 }
 
+#ifdef __AVX__
+template <jit::cpu_isa_t isa>
+static std::unique_ptr<AVXAct> GetAVXAct(const std::string& type) {
+  if (type == "sigmoid") {
+    return std::unique_ptr<AVXAct>(new AVXActImpl<kSigmoid, isa>());
+  } else if (type == "relu") {
+    return std::unique_ptr<AVXAct>(new AVXActImpl<kRelu, isa>());
+  } else if (type == "tanh") {
+    return std::unique_ptr<AVXAct>(new AVXActImpl<kTanh, isa>());
+  } else if (type == "identity" || type == "") {
+    return std::unique_ptr<AVXAct>(new AVXActImpl<kIdentity, isa>());
+  }
+  PADDLE_THROW("Not support type: %s", type);
+  return nullptr;
+}
+#endif
+
 /* LSTM JitKernel */
 template <typename T, jit::cpu_isa_t isa, jit_block>
 class LSTMKernelImpl : public LSTMKernel<T> {
@@ -192,61 +209,49 @@ class LSTMKernelImpl : public LSTMKernel<T> {
 #endif
 };
 
-#define INTRI8_FLOAT(isa)                                                      \
-  template <>                                                                  \
-  LSTMKernelImpl<float, isa, kEQ8>::LSTMKernelImpl(                            \
-      const std::string& act_gate, const std::string& act_cand,                \
-      const std::string& act_cell, int d)                                      \
-      : LSTMKernel<float>() {                                                  \
-    auto GetAVXAct = [&](const std::string& type) -> std::unique_ptr<AVXAct> { \
-      if (type == "sigmoid") {                                                 \
-        return std::unique_ptr<AVXAct>(new AVXActImpl<kSigmoid, isa>());       \
-      } else if (type == "relu") {                                             \
-        return std::unique_ptr<AVXAct>(new AVXActImpl<kRelu, isa>());          \
-      } else if (type == "tanh") {                                             \
-        return std::unique_ptr<AVXAct>(new AVXActImpl<kTanh, isa>());          \
-      } else if (type == "identity" || type == "") {                           \
-        return std::unique_ptr<AVXAct>(new AVXActImpl<kIdentity, isa>());      \
-      }                                                                        \
-      PADDLE_THROW("Not support type: %s", type);                              \
-    };                                                                         \
-    avx_act_gate_ = GetAVXAct(act_gate);                                       \
-    avx_act_cand_ = GetAVXAct(act_cand);                                       \
-    avx_act_cell_ = GetAVXAct(act_cell);                                       \
-  }                                                                            \
-  template <>                                                                  \
-  void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt(                          \
-      float* gates, const float* ct_1, float* ct, float* ht,                   \
-      const float* wp_data, float* checked) const {                            \
-    /* gates: W_ch, W_ih, W_fh, W_oh */                                        \
-    __m256 c, i, f, o;                                                         \
-    c = _mm256_loadu_ps(gates);                                                \
-    i = _mm256_loadu_ps(gates + 8);                                            \
-    f = _mm256_loadu_ps(gates + 16);                                           \
-    o = _mm256_loadu_ps(gates + 24);                                           \
-    /* C_t = C_t-1 * fgated + cand_gated * igated*/                            \
-    c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i));   \
-    i = _mm256_loadu_ps(ct_1);                                                 \
-    f = _mm256_mul_ps(i, avx_act_gate_->Compute(f));                           \
-    f = _mm256_add_ps(c, f);                                                   \
-    _mm256_storeu_ps(ct, f);                                                   \
-    /* H_t = act_cell(C_t) * ogated */                                         \
-    o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o));   \
-    _mm256_storeu_ps(ht, o);                                                   \
-  }                                                                            \
-  template <>                                                                  \
-  void LSTMKernelImpl<float, isa, kEQ8>::ComputeC1H1(                          \
-      float* gates, float* ct, float* ht, const float* wp_data) const {        \
-    __m256 c, i, o;                                                            \
-    c = _mm256_loadu_ps(gates);                                                \
-    i = _mm256_loadu_ps(gates + 8);                                            \
-    o = _mm256_loadu_ps(gates + 24);                                           \
-    /* C_t = igated * cgated*/                                                 \
-    c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c));   \
-    _mm256_storeu_ps(ct, c);                                                   \
-    /* H_t = act_cell(C_t) * ogated */                                         \
-    o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o));   \
-    _mm256_storeu_ps(ht, o);                                                   \
+#define INTRI8_FLOAT(isa)                                                    \
+  template <>                                                                \
+  LSTMKernelImpl<float, isa, kEQ8>::LSTMKernelImpl(                          \
+      const std::string& act_gate, const std::string& act_cand,              \
+      const std::string& act_cell, int d)                                    \
+      : LSTMKernel<float>() {                                                \
+    avx_act_gate_ = GetAVXAct<isa>(act_gate);                                \
+    avx_act_cand_ = GetAVXAct<isa>(act_cand);                                \
+    avx_act_cell_ = GetAVXAct<isa>(act_cell);                                \
+  }                                                                          \
+  template <>                                                                \
+  void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt(                        \
+      float* gates, const float* ct_1, float* ct, float* ht,                 \
+      const float* wp_data, float* checked) const {                          \
+    /* gates: W_ch, W_ih, W_fh, W_oh */                                      \
+    __m256 c, i, f, o;                                                       \
+    c = _mm256_loadu_ps(gates);                                              \
+    i = _mm256_loadu_ps(gates + 8);                                          \
+    f = _mm256_loadu_ps(gates + 16);                                         \
+    o = _mm256_loadu_ps(gates + 24);                                         \
+    /* C_t = C_t-1 * fgated + cand_gated * igated*/                          \
+    c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \
+    i = _mm256_loadu_ps(ct_1);                                               \
+    f = _mm256_mul_ps(i, avx_act_gate_->Compute(f));                         \
+    f = _mm256_add_ps(c, f);                                                 \
+    _mm256_storeu_ps(ct, f);                                                 \
+    /* H_t = act_cell(C_t) * ogated */                                       \
+    o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \
+    _mm256_storeu_ps(ht, o);                                                 \
+  }                                                                          \
+  template <>                                                                \
+  void LSTMKernelImpl<float, isa, kEQ8>::ComputeC1H1(                        \
+      float* gates, float* ct, float* ht, const float* wp_data) const {      \
+    __m256 c, i, o;                                                          \
+    c = _mm256_loadu_ps(gates);                                              \
+    i = _mm256_loadu_ps(gates + 8);                                          \
+    o = _mm256_loadu_ps(gates + 24);                                         \
+    /* C_t = igated * cgated*/                                               \
+    c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \
+    _mm256_storeu_ps(ct, c);                                                 \
+    /* H_t = act_cell(C_t) * ogated */                                       \
+    o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \
+    _mm256_storeu_ps(ht, o);                                                 \
   }
 
 // TODO(TJ): optimize keq16
@@ -354,6 +359,126 @@ REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM,
 #undef JITKERNEL_DECLARE_LSTM
 #undef JITKERNEL_KEY_LSTM
 #undef JITKERNEL_NEW_LSTM_IMPL
+
+/* GRU JitKernel */
+template <typename T, jit::cpu_isa_t isa, jit_block>
+class GRUKernelImpl : public GRUKernel<T> {
+ public:
+  explicit GRUKernelImpl(const std::string& act_gate,
+                         const std::string& act_state, int d)
+      : GRUKernel<T>() {
+    d_ = d;
+    d2_ = d * 2;
+    act_gate_d2_ = GetActKernel<T>(act_gate, d2_);
+    act_gate_d_ = GetActKernel<T>(act_gate, d);
+    act_state_d_ = GetActKernel<T>(act_state, d);
+    vmul_d_ = KernelPool::Instance().template Get<VMulKernel<T>>(d);
+  }
+
+  void ComputeH1(T* gates, T* ht) const override {
+    act_gate_d_->Compute(gates, gates);
+    act_state_d_->Compute(gates + d2_, gates + d2_);
+    vmul_d_->Compute(gates, gates + d2_, ht);
+  }
+
+  void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override {
+    // W: {W_update, W_reset; W_state}
+    act_gate_d2_->Compute(gates, gates);
+    vmul_d_->Compute(ht_1, gates + d_, ht);
+  }
+
+  void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override {
+    T* y = gates + d2_;
+    act_state_d_->Compute(y, y);
+    // out = zt*ht~ + (1-zt)*ht_1
+    for (int i = 0; i < d_; ++i) {
+      ht[i] = gates[i] * y[i] + (static_cast<T>(1) - gates[i]) * ht_1[i];
+    }
+  }
+
+ private:
+  int d_, d2_;
+  std::shared_ptr<const VActKernel<T>> act_gate_d2_, act_gate_d_, act_state_d_;
+  std::shared_ptr<const VMulKernel<T>> vmul_d_;
+#ifdef __AVX__
+  std::unique_ptr<const AVXAct> avx_act_gate_, avx_act_state_;
+#endif
+};
+
+#define INTRI8_FLOAT(isa)                                                     \
+  template <>                                                                 \
+  GRUKernelImpl<float, isa, kEQ8>::GRUKernelImpl(                             \
+      const std::string& act_gate, const std::string& act_state, int d)       \
+      : GRUKernel<float>() {                                                  \
+    avx_act_gate_ = GetAVXAct<isa>(act_gate);                                 \
+    avx_act_state_ = GetAVXAct<isa>(act_state);                               \
+  }                                                                           \
+  template <>                                                                 \
+  void GRUKernelImpl<float, isa, kEQ8>::ComputeH1(float* gates, float* ht)    \
+      const {                                                                 \
+    __m256 u, s;                                                              \
+    /* W: {W_update, W_reset; W_state} */                                     \
+    u = _mm256_loadu_ps(gates);                                               \
+    s = _mm256_loadu_ps(gates + 16);                                          \
+    s = _mm256_mul_ps(avx_act_gate_->Compute(u), avx_act_state_->Compute(s)); \
+    _mm256_storeu_ps(ht, s);                                                  \
+  }                                                                           \
+  template <>                                                                 \
+  void GRUKernelImpl<float, isa, kEQ8>::ComputeHtPart1(                       \
+      float* gates, const float* ht_1, float* ht) const {                     \
+    /* not exactly equal the any implementation */                            \
+    __m256 r, ht0;                                                            \
+    r = _mm256_loadu_ps(gates + 8);                                           \
+    ht0 = _mm256_loadu_ps(ht_1);                                              \
+    r = _mm256_mul_ps(avx_act_gate_->Compute(r), ht0);                        \
+    _mm256_storeu_ps(ht, r);                                                  \
+  }                                                                           \
+  template <>                                                                 \
+  void GRUKernelImpl<float, isa, kEQ8>::ComputeHtPart2(                       \
+      float* gates, const float* ht_1, float* ht) const {                     \
+    /* not exactly equal the any implementation */                            \
+    __m256 u, s, ht0;                                                         \
+    u = _mm256_loadu_ps(gates);                                               \
+    s = _mm256_loadu_ps(gates + 16);                                          \
+    ht0 = _mm256_loadu_ps(ht_1);                                              \
+    u = avx_act_gate_->Compute(u);                                            \
+    s = _mm256_mul_ps(u, avx_act_state_->Compute(s));                         \
+    u = _mm256_sub_ps(_mm256_set1_ps(1.f), u);                                \
+    u = _mm256_mul_ps(u, ht0);                                                \
+    u = _mm256_add_ps(s, u);                                                  \
+    _mm256_storeu_ps(ht, u);                                                  \
+  }
+
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2);
+#endif
+#ifdef __AVX512F__
+INTRI8_FLOAT(jit::avx512f);
+#endif
+
+#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype)                       \
+  template <>                                                             \
+  std::shared_ptr<const GRUKernel<ker_dtype>> KernelPool::Get<            \
+      GRUKernel<ker_dtype>, const std::string&, const std::string&, int>( \
+      const std::string& act_gate, const std::string& act_state, int d)
+
+#define JITKERNEL_KEY_GRU(ker_key, dtype_key) \
+  #ker_key #dtype_key + std::to_string(d) + act_gate + act_state
+
+#define JITKERNEL_NEW_GRU_IMPL(ker, dtype, isa, k) \
+  p = std::dynamic_pointer_cast<ker<dtype>>(       \
+      std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_state, d));
+
+REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DECLARE_GRU,
+                        JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL);
+
+#undef INTRI8_FLOAT
+#undef JITKERNEL_NEW_GRU_IMPL
+#undef JITKERNEL_KEY_GRU
+#undef JITKERNEL_DECLARE_GRU
 }  // namespace jitkernel
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc
index b871851798..8df43bb616 100644
--- a/paddle/fluid/operators/math/pooling.cc
+++ b/paddle/fluid/operators/math/pooling.cc
@@ -31,7 +31,7 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                   const framework::Tensor& input, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_process,
-                  framework::Tensor* output) {
+                  bool exclusive, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -68,7 +68,8 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                 pool_process.compute(input_data[h * input_width + w], &ele);
               }
             }
-            int pool_size = (hend - hstart) * (wend - wstart);
+            int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
+                                      : ksize_height * ksize_width;
             pool_process.finalize(static_cast<T>(pool_size), &ele);
             output_data[ph * output_width + pw] = ele;
           }
@@ -93,7 +94,7 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
       const framework::Tensor& output, const framework::Tensor& output_grad,
       const std::vector<int>& ksize, const std::vector<int>& strides,
       const std::vector<int>& paddings, PoolProcess pool_grad_process,
-      framework::Tensor* input_grad) {
+      bool exclusive, framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -124,7 +125,8 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
             int wstart = pw * stride_width - padding_width;
             int wend = std::min(wstart + ksize_width, input_width);
             wstart = std::max(wstart, 0);
-            int pool_size = (hend - hstart) * (wend - wstart);
+            int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
+                                      : ksize_height * ksize_width;
             float scale = 1.0 / pool_size;
             for (int h = hstart; h < hend; ++h) {
               for (int w = wstart; w < wend; ++w) {
@@ -249,7 +251,7 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                   const framework::Tensor& input, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_process,
-                  framework::Tensor* output) {
+                  bool exclusive, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -300,7 +302,9 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                 }
               }
               int pool_size =
-                  (dend - dstart) * (hend - hstart) * (wend - wstart);
+                  exclusive
+                      ? (dend - dstart) * (hend - hstart) * (wend - wstart)
+                      : ksize_depth * ksize_height * ksize_width;
               pool_process.finalize(static_cast<T>(pool_size), &ele);
               output_data[output_idx] = ele;
             }
@@ -326,7 +330,7 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
       const framework::Tensor& output, const framework::Tensor& output_grad,
       const std::vector<int>& ksize, const std::vector<int>& strides,
       const std::vector<int>& paddings, PoolProcess pool_grad_process,
-      framework::Tensor* input_grad) {
+      bool exclusive, framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -369,7 +373,9 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
               wstart = std::max(wstart, 0);
 
               int pool_size =
-                  (dend - dstart) * (hend - hstart) * (wend - wstart);
+                  exclusive
+                      ? (dend - dstart) * (hend - hstart) * (wend - wstart)
+                      : ksize_depth * ksize_height * ksize_width;
               float scale = 1.0 / pool_size;
               for (int d = dstart; d < dend; ++d) {
                 for (int h = hstart; h < hend; ++h) {
diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu
index b1c76350d1..a689eb4224 100644
--- a/paddle/fluid/operators/math/pooling.cu
+++ b/paddle/fluid/operators/math/pooling.cu
@@ -29,7 +29,7 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
                              const int ksize_width, const int stride_height,
                              const int stride_width, const int padding_height,
                              const int padding_width, PoolProcess pool_process,
-                             T* output_data) {
+                             bool exclusive, T* output_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -52,7 +52,8 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
         pool_process.compute(input_data[h * input_width + w], &ele);
       }
     }
-    int pool_size = (hend - hstart) * (wend - wstart);
+    int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
+                              : ksize_height * ksize_width;
     pool_process.finalize(static_cast<T>(pool_size), &ele);
     output_data[index] = ele;
   }
@@ -65,7 +66,7 @@ __global__ void KernelPool2DGrad(
     const int input_width, const int output_height, const int output_width,
     const int ksize_height, const int ksize_width, const int stride_height,
     const int stride_width, const int padding_height, const int padding_width,
-    PoolProcess pool_process, T* input_grad) {
+    PoolProcess pool_process, bool exclusive, T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int offsetW = index % input_width + padding_width;
@@ -95,7 +96,8 @@ __global__ void KernelPool2DGrad(
         int wend = min(wstart + ksize_width, input_width);
         hstart = max(hstart, 0);
         wstart = max(wstart, 0);
-        int pool_size = (hend - hstart) * (wend - wstart);
+        int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
+                                  : ksize_height * ksize_width;
         int output_sub_idx = ph * output_width + pw;
         pool_process.compute(input, output_data[output_sub_idx],
                              output_grad[output_sub_idx],
@@ -163,7 +165,7 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
                   const framework::Tensor& input, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_process,
-                  framework::Tensor* output) {
+                  bool exclusive, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -189,7 +191,8 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
     KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, input_channels, input_height, input_width,
         output_height, output_width, ksize_height, ksize_width, stride_height,
-        stride_width, padding_height, padding_width, pool_process, output_data);
+        stride_width, padding_height, padding_width, pool_process, exclusive,
+        output_data);
   }
 };
 
@@ -208,7 +211,7 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_process,
-                  framework::Tensor* input_grad) {
+                  bool exclusive, framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -236,7 +239,7 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
         nthreads, input_data, output_data, output_grad_data, input_channels,
         input_height, input_width, output_height, output_width, ksize_height,
         ksize_width, stride_height, stride_width, padding_height, padding_width,
-        pool_process, input_grad_data);
+        pool_process, exclusive, input_grad_data);
   }
 };
 
@@ -313,16 +316,14 @@ template class Pool2dGradFunctor<platform::CUDADeviceContext,
                                  double>;
 
 template <typename PoolProcess, typename T>
-__global__ void KernelPool3D(const int nthreads, const T* input_data,
-                             const int channels, const int input_depth,
-                             const int input_height, const int input_width,
-                             const int output_depth, const int output_height,
-                             const int output_width, const int ksize_depth,
-                             const int ksize_height, const int ksize_width,
-                             const int stride_depth, const int stride_height,
-                             const int stride_width, const int padding_depth,
-                             const int padding_height, const int padding_width,
-                             PoolProcess pool_process, T* output_data) {
+__global__ void KernelPool3D(
+    const int nthreads, const T* input_data, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    PoolProcess pool_process, bool exclusive, T* output_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -351,7 +352,9 @@ __global__ void KernelPool3D(const int nthreads, const T* input_data,
         }
       }
     }
-    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    int pool_size = exclusive
+                        ? (dend - dstart) * (hend - hstart) * (wend - wstart)
+                        : ksize_depth * ksize_height * ksize_width;
     pool_process.finalize(static_cast<T>(pool_size), &ele);
     output_data[index] = ele;
   }
@@ -366,7 +369,7 @@ __global__ void KernelPool3DGrad(
     const int ksize_height, const int ksize_width, const int stride_depth,
     const int stride_height, const int stride_width, const int padding_depth,
     const int padding_height, const int padding_width, PoolProcess pool_process,
-    T* input_grad) {
+    bool exclusive, T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int offsetW = index % input_width + padding_width;
@@ -409,7 +412,9 @@ __global__ void KernelPool3DGrad(
           dstart = max(dstart, 0);
           hstart = max(hstart, 0);
           wstart = max(wstart, 0);
-          int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          int pool_size =
+              exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart)
+                        : ksize_depth * ksize_height * ksize_width;
           int output_sub_idx = (pd * output_height + ph) * output_width + pw;
           pool_process.compute(input, output_data[output_sub_idx],
                                output_grad[output_sub_idx],
@@ -484,7 +489,7 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
                   const framework::Tensor& input, const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_process,
-                  framework::Tensor* output) {
+                  bool exclusive, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -517,7 +522,7 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
         nthreads, input_data, input_channels, input_depth, input_height,
         input_width, output_depth, output_height, output_width, ksize_depth,
         ksize_height, ksize_width, stride_depth, stride_height, stride_width,
-        padding_depth, padding_height, padding_width, pool_process,
+        padding_depth, padding_height, padding_width, pool_process, exclusive,
         output_data);
   }
 };
@@ -537,7 +542,7 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_process,
-                  framework::Tensor* input_grad) {
+                  bool exclusive, framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -573,7 +578,7 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
         input_depth, input_height, input_width, output_depth, output_height,
         output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
         stride_height, stride_width, padding_depth, padding_height,
-        padding_width, pool_process, input_grad_data);
+        padding_width, pool_process, exclusive, input_grad_data);
   }
 };
 
diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h
index 120f591980..0f64e321bf 100644
--- a/paddle/fluid/operators/math/pooling.h
+++ b/paddle/fluid/operators/math/pooling.h
@@ -89,7 +89,7 @@ class Pool2dFunctor {
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_compute,
-                  framework::Tensor* output);
+                  bool exclusive, framework::Tensor* output);
 };
 
 template <typename DeviceContext, typename PoolProcess, typename T>
@@ -101,7 +101,7 @@ class Pool2dGradFunctor {
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_compute,
-                  framework::Tensor* input_grad);
+                  bool exclusive, framework::Tensor* input_grad);
 };
 
 template <typename DeviceContext, class T>
@@ -123,7 +123,7 @@ class Pool3dFunctor {
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_compute,
-                  framework::Tensor* output);
+                  bool exclusive, framework::Tensor* output);
 };
 
 template <typename DeviceContext, typename PoolProcess, typename T>
@@ -135,7 +135,7 @@ class Pool3dGradFunctor {
                   const std::vector<int>& ksize,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings, PoolProcess pool_compute,
-                  framework::Tensor* input_grad);
+                  bool exclusive, framework::Tensor* input_grad);
 };
 
 template <typename DeviceContext, class T>
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 08f57dd45a..7594674037 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <map>
 #include <set>
-#include <vector>
+#include <unordered_map>
 
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
@@ -230,8 +229,24 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
 // add or mul.
 namespace scatter {
 
-size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
-  return std::find(rows.begin(), rows.end(), value) - rows.begin();
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
+                   size_t data_len, const T* in, T* out) {
+  blas->AXPY(data_len, 1., in, out);
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    !std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
+                   size_t data_len, const T* in, T* out) {
+  for (int64_t i = 0; i < data_len; i++) {
+    out[i] += in[i];
+  }
 }
 
 template <typename T>
@@ -246,48 +261,84 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::SelectedRows& input,
                   framework::SelectedRows* output) {
-    framework::SelectedRows& out = *output;
-    std::vector<int64_t> input_rows(input.rows());
+    std::vector<const framework::SelectedRows*> inputs;
+    inputs.push_back(&input);
+    (*this)(context, inputs, output);
+  }
 
-    std::map<int64_t, std::vector<int64_t>> merge_row_map;
-    for (size_t i = 0; i < input_rows.size(); ++i) {
-      merge_row_map[input_rows[i]].push_back(i);
+  void operator()(const platform::CPUDeviceContext& context,
+                  const std::vector<const framework::SelectedRows*>& inputs,
+                  framework::SelectedRows* output) {
+    if (inputs.size() == 0) {
+      VLOG(3) << "no input! return";
+      return;
     }
-
-    std::vector<int64_t> merge_rows(merge_row_map.size());
-    size_t idx = 0;
-    int64_t input_width = input.value().dims()[1];
-    out.set_height(input.height());
-
-    T* out_data = out.mutable_value()->mutable_data<T>(
+    const framework::SelectedRows* has_value_input = nullptr;
+    for (auto* in : inputs) {
+      if (in->rows().size() > 0) {
+        has_value_input = in;
+        break;
+      }
+    }
+    if (has_value_input == nullptr) {
+      VLOG(3) << "no input has value! just return" << std::endl;
+      return;
+    }
+    auto input_width = has_value_input->value().dims()[1];
+    auto input_height = has_value_input->height();
+    framework::SelectedRows& out = *output;
+    std::set<int64_t> merged_row_set;
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
+                        "all input should have same "
+                        "dimension except for the first one");
+      PADDLE_ENFORCE_EQ(input_height, input->height(),
+                        "all input should have same height");
+      merged_row_set.insert(input->rows().begin(), input->rows().end());
+    }
+    std::vector<int64_t> merge_rows(merged_row_set.begin(),
+                                    merged_row_set.end());
+    std::unordered_map<int64_t, size_t> rows_to_id;
+    for (size_t i = 0; i < merge_rows.size(); ++i) {
+      rows_to_id[merge_rows[i]] = i;
+    }
+    out.set_rows(merge_rows);
+    out.set_height(input_height);
+    out.mutable_value()->mutable_data<T>(
         framework::make_ddim(
             {static_cast<int64_t>(merge_rows.size()), input_width}),
         context.GetPlace());
-    const T* in_data = input.value().data<T>();
-
-    for (auto& row_pair : merge_row_map) {
-      auto* out_ptr = out_data + idx * input_width;
-      auto& rows = row_pair.second;
-      merge_rows[idx] = row_pair.first;
-      ++idx;
-      // rows.size() is always larger than 0
-      std::memcpy(out_ptr, in_data + rows[0] * input_width,
-                  sizeof(T) * input_width);
-
-      for (size_t i = 1; i < rows.size(); ++i) {
-        auto* in_ptr = in_data + rows[i] * input_width;
-        for (int64_t j = 0; j < input_width; ++j) {
-          out_ptr[j] += in_ptr[j];
-        }
+
+    math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
+    constant_functor(context, out.mutable_value(), 0.0);
+
+    auto* out_data = out.mutable_value()->data<T>();
+
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      auto* input_data = input->value().data<T>();
+      auto& input_rows = input->rows();
+
+      for (size_t i = 0; i < input_rows.size(); i++) {
+        size_t out_i = rows_to_id[input_rows[i]];
+        elementwise_add_to<platform::CPUDeviceContext, T>(
+            context, &blas, static_cast<size_t>(input_width),
+            &input_data[i * input_width], &out_data[out_i * input_width]);
       }
     }
-
-    out.set_rows(merge_rows);
   }
 };
 
 template struct MergeAdd<platform::CPUDeviceContext, int>;
 template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
+template struct MergeAdd<platform::CPUDeviceContext, float>;
+template struct MergeAdd<platform::CPUDeviceContext, double>;
 
 template <typename T>
 struct UpdateToTensor<platform::CPUDeviceContext, T> {
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index ba8eccf820..10f39822b9 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -267,10 +267,15 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::SelectedRows& input,
                   framework::SelectedRows* output) {
-    framework::SelectedRows& out = *output;
     framework::Vector<int64_t> input_rows(input.rows());
+    if (input_rows.size() == 0) {
+      return;
+    }
+
+    framework::SelectedRows& out = *output;
     std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
-    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
+    std::vector<int64_t> merge_rows_cpu(row_set.begin(), row_set.end());
+    framework::Vector<int64_t> merge_rows(merge_rows_cpu);
 
     auto input_width = input.value().dims()[1];
 
@@ -296,6 +301,73 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
         out.mutable_rows()->CUDAMutableData(context.GetPlace()),
         out.rows().size(), input_width);
   }
+
+  void operator()(const platform::CUDADeviceContext& context,
+                  const std::vector<const framework::SelectedRows*>& inputs,
+                  framework::SelectedRows* output) {
+    if (inputs.size() == 0) {
+      VLOG(3) << "no input! return";
+      return;
+    }
+    const framework::SelectedRows* has_value_input = nullptr;
+    for (auto* in : inputs) {
+      if (in->rows().size() > 0) {
+        has_value_input = in;
+        break;
+      }
+    }
+    if (has_value_input == nullptr) {
+      VLOG(3) << "no input has value! just return" << std::endl;
+      return;
+    }
+    auto input_width = has_value_input->value().dims()[1];
+    auto input_height = has_value_input->height();
+    framework::SelectedRows& out = *output;
+    std::set<int64_t> merged_row_set;
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
+                        "all input should have same "
+                        "dimension except for the first one");
+      PADDLE_ENFORCE_EQ(input_height, input->height(),
+                        "all input should have same height");
+      merged_row_set.insert(input->rows().begin(), input->rows().end());
+    }
+    std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(),
+                                        merged_row_set.end());
+    framework::Vector<int64_t> merge_rows(merge_rows_cpu);
+
+    out.set_rows(merge_rows);
+    out.set_height(input_height);
+    out.mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), input_width}),
+        context.GetPlace());
+
+    math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
+    constant_functor(context, out.mutable_value(), 0.0);
+
+    auto* out_data = out.mutable_value()->data<T>();
+
+    const int block_size = 256;
+    dim3 threads(block_size, 1);
+
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      auto* input_data = input->value().data<T>();
+      auto& input_rows = input->rows();
+      dim3 grid1(input_rows.size(), 1);
+
+      MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
+          input_data, input_rows.CUDAData(context.GetPlace()), out_data,
+          out.mutable_rows()->CUDAMutableData(context.GetPlace()),
+          out.rows().size(), input_width);
+    }
+  }
 };
 
 template struct MergeAdd<platform::CUDADeviceContext, float>;
diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h
index 900be86f91..521c53dd0d 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -83,104 +83,9 @@ struct MergeAdd {
   void operator()(const DeviceContext& context,
                   const framework::SelectedRows& input,
                   framework::SelectedRows* output);
-};
-
-template <>
-struct MergeAdd<platform::CPUDeviceContext, float> {
-  framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
-                                     const framework::SelectedRows& input) {
-    framework::SelectedRows out;
-    (*this)(context, input, &out);
-    return out;
-  }
-
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::SelectedRows& input,
-                  framework::SelectedRows* output) {
-    framework::SelectedRows& out = *output;
-    std::vector<int64_t> input_rows(input.rows());
-
-    std::map<int64_t, std::vector<int64_t>> merge_row_map;
-    for (size_t i = 0; i < input_rows.size(); ++i) {
-      merge_row_map[input_rows[i]].push_back(i);
-    }
-
-    std::vector<int64_t> merge_rows(merge_row_map.size());
-    size_t idx = 0;
-    int64_t input_width = input.value().dims()[1];
-    out.set_height(input.height());
-
-    auto* out_data = out.mutable_value()->mutable_data<float>(
-        framework::make_ddim(
-            {static_cast<int64_t>(merge_rows.size()), input_width}),
-        context.GetPlace());
-    auto* in_data = input.value().data<float>();
-
-    auto blas = GetBlas<platform::CPUDeviceContext, float>(context);
-    for (auto& row_pair : merge_row_map) {
-      auto* out_ptr = out_data + idx * input_width;
-      auto& rows = row_pair.second;
-      merge_rows[idx] = row_pair.first;
-      ++idx;
-      // rows.size() is always larger than 0
-      blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr);
-
-      for (size_t i = 1; i < rows.size(); ++i) {
-        blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr);
-      }
-    }
-
-    out.set_rows(merge_rows);
-  }
-};
-
-template <>
-struct MergeAdd<platform::CPUDeviceContext, double> {
-  framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
-                                     const framework::SelectedRows& input) {
-    framework::SelectedRows out;
-    (*this)(context, input, &out);
-    return out;
-  }
-
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::SelectedRows& input,
-                  framework::SelectedRows* output) {
-    framework::SelectedRows& out = *output;
-    std::vector<int64_t> input_rows(input.rows());
-
-    std::map<int64_t, std::vector<int64_t>> merge_row_map;
-    for (size_t i = 0; i < input_rows.size(); ++i) {
-      merge_row_map[input_rows[i]].push_back(i);
-    }
-
-    std::vector<int64_t> merge_rows(merge_row_map.size());
-    size_t idx = 0;
-    int64_t input_width = input.value().dims()[1];
-    out.set_height(input.height());
-
-    auto* out_data = out.mutable_value()->mutable_data<double>(
-        framework::make_ddim(
-            {static_cast<int64_t>(merge_rows.size()), input_width}),
-        context.GetPlace());
-    auto* in_data = input.value().data<double>();
-
-    auto blas = GetBlas<platform::CPUDeviceContext, double>(context);
-    for (auto& row_pair : merge_row_map) {
-      auto* out_ptr = out_data + idx * input_width;
-      auto& rows = row_pair.second;
-      merge_rows[idx] = row_pair.first;
-      ++idx;
-      // rows.size() is always larger than 0
-      blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr);
-
-      for (size_t i = 1; i < rows.size(); ++i) {
-        blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr);
-      }
-    }
-
-    out.set_rows(merge_rows);
-  }
+  void operator()(const DeviceContext& context,
+                  const std::vector<const framework::SelectedRows*>& inputs,
+                  framework::SelectedRows* output);
 };
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc
index 8355893560..f15b37a1e3 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc
@@ -302,6 +302,64 @@ TEST(selected_rows_functor, cpu_merge_add_int) {
   EXPECT_EQ(out_data[1 * row_numel], 2);
   EXPECT_EQ(out_data[2 * row_numel], 1);
 }
+
+TEST(selected_rows_functor, cpu_merge_add_multi) {
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CPUDeviceContext ctx(cpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
+                                       float>
+      set_const;
+
+  int64_t height = 10;
+  int64_t row_numel = 8;
+
+  std::vector<int64_t> rows1{5, 2, 5, 3, 5};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
+  auto* in1_value = selected_rows1->mutable_value();
+  in1_value->mutable_data<float>(
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      cpu_place);
+  set_const(ctx, in1_value, 1.0);
+
+  std::vector<int64_t> rows2{2, 5, 3, 5, 3};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
+  auto* in2_value = selected_rows2->mutable_value();
+  in2_value->mutable_data<float>(
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      cpu_place);
+  set_const(ctx, in2_value, 1.0);
+
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
+  output->set_height(height);
+  paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
+                                             float>
+      merge_add_functor;
+
+  std::vector<const paddle::framework::SelectedRows*> inputs;
+  inputs.push_back(selected_rows1.get());
+  inputs.push_back(selected_rows2.get());
+  merge_add_functor(ctx, inputs, output.get());
+
+  EXPECT_EQ(output->height(), height);
+  EXPECT_EQ(output->value().dims(),
+            paddle::framework::make_ddim({3, row_numel}));
+
+  std::vector<int64_t> ret_rows{2, 3, 5};
+  EXPECT_EQ(output->rows(), ret_rows);
+
+  auto* out_data = output->value().data<float>();
+  for (size_t i = 0; i < ret_rows.size(); ++i) {
+    for (size_t j = 0; j < row_numel; ++j) {
+      EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
+    }
+  }
+}
+
 TEST(selected_rows_functor, cpu_sum_to) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CPUDeviceContext ctx(cpu_place);
@@ -318,6 +376,7 @@ TEST(selected_rows_functor, cpu_sum_to) {
       paddle::framework::make_ddim(
           {static_cast<int64_t>(rows1.size()), row_numel}),
       cpu_place);
+
   functor(ctx, in1_value, 1.0);
   std::vector<int64_t> rows2{0, 5, 7, 9};
   std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
@@ -327,6 +386,7 @@ TEST(selected_rows_functor, cpu_sum_to) {
       paddle::framework::make_ddim(
           {static_cast<int64_t>(rows2.size()), row_numel}),
       cpu_place);
+
   functor(ctx, in2_value, 2.0);
   std::unique_ptr<paddle::framework::SelectedRows> output{
       new paddle::framework::SelectedRows()};
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu b/paddle/fluid/operators/math/selected_rows_functor_test.cu
index 5fc50aba25..17af3e3999 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu
@@ -241,3 +241,67 @@ TEST(selected_rows_functor, gpu_add_to) {
   // row9: 2.0 + 3.0
   EXPECT_EQ(tensor1_cpu_data[9 * row_numel + 6], 5.0);
 }
+
+TEST(selected_rows_functor, gpu_merge_add) {
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDADeviceContext& ctx =
+      *reinterpret_cast<paddle::platform::CUDADeviceContext*>(
+          paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
+  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
+                                       float>
+      set_const;
+
+  int64_t height = 10;
+  int64_t row_numel = 8;
+
+  std::vector<int64_t> rows1{5, 2, 5, 3, 5};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
+  auto* in1_value = selected_rows1->mutable_value();
+  in1_value->mutable_data<float>(
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      gpu_place);
+  set_const(ctx, in1_value, 1.0);
+
+  std::vector<int64_t> rows2{2, 5, 3, 5, 3};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
+  auto* in2_value = selected_rows2->mutable_value();
+  in2_value->mutable_data<float>(
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      gpu_place);
+  set_const(ctx, in2_value, 1.0);
+
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
+  output->set_height(height);
+  paddle::operators::math::scatter::MergeAdd<
+      paddle::platform::CUDADeviceContext, float>
+      merge_add_functor;
+
+  std::vector<const paddle::framework::SelectedRows*> inputs;
+  inputs.push_back(selected_rows1.get());
+  inputs.push_back(selected_rows2.get());
+  merge_add_functor(ctx, inputs, output.get());
+
+  paddle::framework::Tensor output_cpu;
+  paddle::framework::TensorCopy(output->value(), cpu_place, ctx, &output_cpu);
+  ctx.Wait();
+
+  EXPECT_EQ(output->height(), height);
+  EXPECT_EQ(output->value().dims(),
+            paddle::framework::make_ddim({3, row_numel}));
+
+  std::vector<int64_t> ret_rows{2, 3, 5};
+  EXPECT_EQ(output->rows(), ret_rows);
+
+  auto* out_data = output_cpu.data<float>();
+  for (size_t i = 0; i < ret_rows.size(); ++i) {
+    for (size_t j = 0; j < row_numel; ++j) {
+      EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]);
+    }
+  }
+}
diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc
index 235b5405fb..6d491dbf1e 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -31,7 +31,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename T>
+template <typename T, bool is_test>
 class MaxSeqPoolFunctor {
  public:
   void operator()(const platform::CPUDeviceContext& context,
@@ -70,7 +70,41 @@ class MaxSeqPoolFunctor {
     }
   }
 };
+// Instantisation of Max Sequence Pooling for test phase eg. no need to fill
+// index buffer
+template <typename T>
+class MaxSeqPoolFunctor<T, true> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), 1);
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
 
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      std::memcpy(&out_data[i * dim], &in_data[starts[i] * dim],
+                  dim * sizeof(T));
+      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
+        for (int64_t k = 0; k < dim; ++k) {
+          if (in_data[j * dim + k] > out_data[i * dim + k]) {
+            out_data[i * dim + k] = in_data[j * dim + k];
+          }
+        }
+      }
+    }
+  }
+};
 template <typename T>
 class MaxSeqPoolGradFunctor {
  public:
@@ -157,17 +191,47 @@ class FirstSeqPoolFunctor {
   }
 };
 
+template <typename T>
+class SumSeqPoolGradFunctor {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  framework::LoDTensor* in_grad) {
+    auto lod = in_grad->lod()[0];
+    int64_t out_w = out_grad.numel() / out_grad.dims()[0];
+    int64_t in_w = in_grad->numel() / in_grad->dims()[0];
+    PADDLE_ENFORCE(in_w == out_w);
+    const T* out_g_data = out_grad.data<T>();
+    T* in_g_data = in_grad->mutable_data<T>(context.GetPlace());
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
+      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
+      int64_t in_offset = lod[i] * in_w;
+      const T* out_pos = out_g_data + i * out_w;
+      T* in_pos = in_g_data + in_offset;
+      for (int r = 0; r != h; ++r) {
+        blas.VCOPY(in_w, out_pos, in_pos + r * in_w);
+      }
+    }
+  }
+};
+
 template <typename T>
 class SequencePoolFunctor<platform::CPUDeviceContext, T> {
  public:
   /* max pool has index output */
   void operator()(const platform::CPUDeviceContext& context,
                   const std::string pooltype, const framework::LoDTensor& input,
-                  framework::Tensor* output,
+                  framework::Tensor* output, bool is_test,
                   framework::Tensor* index = nullptr) {
     if (pooltype == "MAX") {
-      math::MaxSeqPoolFunctor<T> max_pool;
-      max_pool(context, input, output, index);
+      if (is_test) {
+        math::MaxSeqPoolFunctor<T, true> max_pool;
+        max_pool(context, input, output, index);
+      } else {
+        math::MaxSeqPoolFunctor<T, false> max_pool;
+        max_pool(context, input, output, index);
+      }
       return;
     }
     if (pooltype == "LAST") {
@@ -175,6 +239,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
       last_pool(context, input, output);
       return;
     }
+
     if (pooltype == "FIRST") {
       math::FirstSeqPoolFunctor<T> first_pool;
       first_pool(context, input, output);
@@ -231,9 +296,15 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
       math::SetConstant<platform::CPUDeviceContext, T> functor;
       functor(context, in_grad, 0);
     }
+
+    if (pooltype == "SUM") {
+      math::SumSeqPoolGradFunctor<T> sum_pool_grad;
+      sum_pool_grad(context, out_grad, in_grad);
+      return;
+    }
+
     auto lod = in_grad->lod()[0];
     auto& place = *context.eigen_device();
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
     for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
       auto in_g_t = in_grad->Slice(static_cast<int>(lod[i]),
                                    static_cast<int>(lod[i + 1]));
@@ -247,12 +318,6 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
 
       if (pooltype == "AVERAGE") {
         in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
-      } else if (pooltype == "SUM") {
-        const T* out_g_data = out_g_t.data<T>();
-        T* in_g_data = in_g_t.mutable_data<T>(context.GetPlace());
-        for (int r = 0; r != h; ++r) {
-          blas.VCOPY(w, out_g_data, in_g_data + r * w);
-        }
       } else if (pooltype == "SQRT") {
         in_g_e.device(place) =
             (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu
index a92aef805a..0015fafbc8 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
@@ -133,7 +133,7 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const std::string pooltype, const framework::LoDTensor& input,
-                  framework::Tensor* output,
+                  framework::Tensor* output, bool is_test,
                   framework::Tensor* index = nullptr) {
     auto& lod = input.lod()[0];
     const size_t item_dim = output->numel() / output->dims()[0];
diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h
index 8dcbee65d0..a1046ea216 100644
--- a/paddle/fluid/operators/math/sequence_pooling.h
+++ b/paddle/fluid/operators/math/sequence_pooling.h
@@ -28,7 +28,7 @@ class SequencePoolFunctor {
   /* max pool has index output */
   void operator()(const DeviceContext& context, const std::string pooltype,
                   const framework::LoDTensor& input, framework::Tensor* output,
-                  framework::Tensor* index = nullptr);
+                  bool is_test = false, framework::Tensor* index = nullptr);
 };
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc
new file mode 100644
index 0000000000..2bc008dd34
--- /dev/null
+++ b/paddle/fluid/operators/math/sequence_pooling_test.cc
@@ -0,0 +1,126 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/sequence_pooling.h"
+#include <gtest/gtest.h>
+#include <vector>
+
+template <typename DeviceContext, typename Place, typename T>
+void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
+  paddle::framework::LoDTensor cpu_out_grad;
+  paddle::framework::LoDTensor cpu_in_grad;
+  paddle::framework::LoDTensor out_grad;
+  paddle::framework::LoDTensor in_grad;
+  const size_t second_dim = 128u;
+
+  // construct out_grad's tensor in cpu
+  const size_t out_first_dim = lod[0].size() - 1;
+  auto out_dims = paddle::framework::make_ddim(
+      {static_cast<int64_t>(out_first_dim), static_cast<int64_t>(second_dim)});
+
+  cpu_out_grad.mutable_data<T>(out_dims, paddle::platform::CPUPlace());
+  for (int64_t i = 0; i < cpu_out_grad.numel(); ++i) {
+    cpu_out_grad.data<T>()[i] = static_cast<T>(i);
+  }
+
+  // copy to dst out_grad
+  auto* place = new Place();
+  DeviceContext* context = new DeviceContext(*place);
+  if (paddle::platform::is_cpu_place(*place)) {
+    out_grad = cpu_out_grad;
+  } else {
+    TensorCopySync(cpu_out_grad, *place, &out_grad);
+  }
+
+  // construct in_grad
+  in_grad.set_lod(lod);
+  auto in_dims = paddle::framework::make_ddim(
+      {static_cast<int64_t>(lod[0].back()), static_cast<int64_t>(second_dim)});
+  in_grad.mutable_data<T>(in_dims, context->GetPlace());
+
+  // check tensor contruction result
+  PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size());
+  for (int64_t i = 1; i < out_grad.dims().size(); ++i) {
+    PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]);
+  }
+
+  // call functor
+  paddle::operators::math::SequencePoolGradFunctor<DeviceContext, T>()(
+      *context, "SUM", out_grad, &in_grad);
+
+  if (paddle::platform::is_cpu_place(*place)) {
+    cpu_in_grad = in_grad;
+  } else {
+    TensorCopySync(in_grad, paddle::platform::CPUPlace(), &cpu_in_grad);
+    cpu_in_grad.set_lod(in_grad.lod());
+  }
+
+  EXPECT_EQ(in_grad.numel(), lod[0].back() * second_dim);
+  EXPECT_EQ(in_grad.lod(), lod);
+
+  if (paddle::platform::is_cpu_place(*place)) {
+    for (int64_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) {
+      int64_t begin = in_grad.lod()[0][i];
+      int64_t end = in_grad.lod()[0][i + 1];
+      paddle::framework::Tensor tmp = in_grad.Slice(begin, end);
+      for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) {
+        for (int64_t m = 0; m != second_dim; ++m) {
+          EXPECT_EQ(tmp.data<T>()[m + j * second_dim],
+                    out_grad.data<T>()[m + i * second_dim]);
+        }
+      }
+    }
+  } else {
+    for (int64_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) {
+      int64_t begin = cpu_in_grad.lod()[0][i];
+      int64_t end = cpu_in_grad.lod()[0][i + 1];
+      paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end);
+      for (int64_t j = 0; j != tmp.numel() / second_dim; ++j) {
+        for (int64_t m = 0; m != second_dim; ++m) {
+          EXPECT_EQ(tmp.data<T>()[m + j * second_dim],
+                    cpu_out_grad.data<T>()[m + i * second_dim]);
+        }
+      }
+    }
+  }
+
+  delete place;
+  delete context;
+}
+
+TEST(SequencePoolingGrad, CPU_SUM) {
+  paddle::framework::LoD lod1;
+  lod1.push_back(std::vector<size_t>{0, 10});
+  TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
+                         paddle::platform::CPUPlace, float>(lod1);
+
+  paddle::framework::LoD lod2;
+  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
+  TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
+                         paddle::platform::CPUPlace, float>(lod2);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(SequencePoolingGrad, CUDA_SUM) {
+  paddle::framework::LoD lod1;
+  lod1.push_back(std::vector<size_t>{0, 10});
+  TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
+                         paddle::platform::CUDAPlace, float>(lod1);
+
+  paddle::framework::LoD lod2;
+  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
+  TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
+                         paddle::platform::CUDAPlace, float>(lod2);
+}
+#endif
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 9e0bebd17c..19426b3c20 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -34,7 +34,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X", "(Tensor) The input of mean op");
-    AddOutput("Out", "(Tensor) The output of mean op").Reuse("X");
+    AddOutput("Out", "(Tensor) The output of mean op");
     AddComment(R"DOC(
 Mean Operator calculates the mean of all elements in X.
 
diff --git a/paddle/fluid/operators/merge_ids_op.cc b/paddle/fluid/operators/merge_ids_op.cc
index c6ec4ab047..6e0e136980 100644
--- a/paddle/fluid/operators/merge_ids_op.cc
+++ b/paddle/fluid/operators/merge_ids_op.cc
@@ -20,13 +20,16 @@ namespace operators {
 class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
-    AddInput(
-        "X",
-        "(LoDTensors) multi input tensor with shape{batch_num, N}, N is the "
-        "size of embedding table")
+    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}")
+        .AsDuplicable();
+    AddInput("Rows", "(LoDTensor) the input ids with shape{row_size, 1}, ")
+        .AsDuplicable();
+    AddInput("X",
+             "(LoDTensors) multi input tensor with shape{Rows, N}, N is the "
+             "size of embedding table")
+        .AsDuplicable();
+    AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.")
         .AsDuplicable();
-    AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.");
 
     AddComment(R"DOC(
 Merge multi LoDTensor's into one according to Ids's shard num.
@@ -79,15 +82,19 @@ class MergeIdsOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must has input Ids.");
-    PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has input X.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must has output Out.");
+    PADDLE_ENFORCE(ctx->HasInputs("Ids"),
+                   "MergeIdsOp must has multi input Ids.");
+    PADDLE_ENFORCE(ctx->HasInputs("Rows"),
+                   "MergeIdsOp must has multi input Rows.");
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has multi input X.");
+    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
+                   "MergeIdsOp must has multi output Out.");
 
     auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    auto ids_dims = ctx->GetInputDim("Ids");
+    auto ids_dims = ctx->GetInputsDim("Ids");
     if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
-      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+      PADDLE_ENFORCE_EQ(ids_dims[0].size(), 2);
+      PADDLE_ENFORCE_EQ(ids_dims[0][1], 1);
     }
     auto x_var_type = ctx->GetInputsVarType("X");
     for (auto &var_type : x_var_type) {
diff --git a/paddle/fluid/operators/merge_ids_op.h b/paddle/fluid/operators/merge_ids_op.h
index 83712a8519..fef9e023d0 100644
--- a/paddle/fluid/operators/merge_ids_op.h
+++ b/paddle/fluid/operators/merge_ids_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <tuple>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -30,59 +32,70 @@ class MergeIdsOpKernel : public framework::OpKernel<T> {
     if (!platform::is_cpu_place(place)) {
       PADDLE_THROW("MergeIds do not support GPU kernel");
     }
-    VLOG(3) << "run in MergeIdsOpKernel";
 
-    const auto *ids_var = ctx.InputVar("Ids");
-    PADDLE_ENFORCE(ids_var->IsType<framework::LoDTensor>(),
-                   "only support to merge Ids of LoDTensor");
+    const auto ids = ctx.MultiInput<framework::LoDTensor>("Ids");
+    const auto row_ids = ctx.MultiInput<framework::LoDTensor>("Rows");
+    const auto x_tensors = ctx.MultiInput<framework::LoDTensor>("X");
+    auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
 
-    const auto &ids_tensor = ids_var->Get<framework::LoDTensor>();
-    const auto &ids_dims = ids_tensor.dims();
-    const int64_t *ids = ids_tensor.data<int64_t>();
+    PADDLE_ENFORCE_EQ(row_ids.size(), x_tensors.size(),
+                      "the number of Rows and X should be the same");
+    PADDLE_ENFORCE_EQ(ids.size(), outs.size(),
+                      "the number of Ids and Out should be the same");
 
-    auto x_tensors = ctx.MultiInput<framework::LoDTensor>("X");
+    int row_ids_size = 0;
+    int row_size = 0;
+    int embedding_size = 0;
 
-    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    for (int i = 0; i < x_tensors.size(); ++i) {
+      const auto *x_tensor = x_tensors[i];
+      const auto *row_id = row_ids[i];
 
-    int batch_size = 0;
-    int embedding_size = 0;
-    for (auto &input : x_tensors) {
-      if (framework::product(input->dims()) != 0) {
-        if (embedding_size == 0) {
-          embedding_size = input->dims()[1];
-        }
-        PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1],
-                          "embedding size of all input should be the same");
-        batch_size += input->dims()[0];
+      if (embedding_size == 0) {
+        embedding_size = x_tensor->dims()[1];
       }
+      PADDLE_ENFORCE_EQ(embedding_size, x_tensor->dims()[1],
+                        "embedding size of all input should be the same");
+      row_size += x_tensor->dims()[0];
+      row_ids_size += row_id->dims()[0];
     }
+
     PADDLE_ENFORCE_EQ(
-        batch_size, ids_dims[0],
-        "the batch size of ids and merged embedding value should be the same");
+        row_size, row_ids_size,
+        "the merged X dim[0] and merged Rows dim[0] should be the same");
+
+    std::unordered_map<int64_t, std::tuple<int64_t, int64_t>>
+        selected_rows_idx_map;
+    for (int i = 0; i < x_tensors.size(); ++i) {
+      const auto *row_id = row_ids[i];
+
+      for (int j = 0; j < row_id->numel(); ++j) {
+        int64_t key = row_id->data<int64_t>()[j];
+        std::tuple<int64_t, int64_t> val = std::make_tuple(i, j);
+        selected_rows_idx_map.insert(std::make_pair(key, val));
+      }
+    }
+    PADDLE_ENFORCE_EQ(row_ids_size, selected_rows_idx_map.size(),
+                      "the rows and tensor map size should be the same");
+
+    for (int i = 0; i < outs.size(); ++i) {
+      auto *out_ids = ids[i];
+      auto *out = outs[i];
 
-    const size_t shard_num = x_tensors.size();
+      out->set_lod(out_ids->lod());
 
-    if (shard_num == 1) {
-      VLOG(3) << "only one shard, we can copy the data directly";
-      TensorCopy(*x_tensors[0], place, out);
-    } else {
-      std::vector<int> in_indexs(shard_num, 0);
+      int nums = static_cast<int>(out_ids->dims()[0]);
       auto *out_data = out->mutable_data<T>(
-          framework::make_ddim({batch_size, embedding_size}), place);
-      // copy data from ins[shard_num] to out.
-      for (int i = 0; i < ids_dims[0]; ++i) {
-        int64_t id = ids[i];
-        size_t shard_id = static_cast<size_t>(id) % shard_num;
-        int index = in_indexs[shard_id];
-        memcpy(out_data + embedding_size * i,
-               x_tensors[shard_id]->data<T>() + index * embedding_size,
+          framework::make_ddim({nums, embedding_size}), place);
+      for (int j = 0; j < nums; ++j) {
+        int id = out_ids->data<int64_t>()[j];
+        auto row_tuple = selected_rows_idx_map[id];
+        int64_t row_idx = std::get<1>(row_tuple);
+        const auto *x_tensor = x_tensors[std::get<0>(row_tuple)];
+
+        memcpy(out_data + embedding_size * j,
+               x_tensor->data<T>() + row_idx * embedding_size,
                sizeof(T) * embedding_size);
-        in_indexs[shard_id] += 1;
-      }
-
-      for (size_t i = 0; i < shard_num; ++i) {
-        PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0],
-                          "after merge, all data in x_tensor should be used");
       }
     }
   }
diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc
index 12b916fceb..7f0b51580a 100644
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
@@ -19,54 +19,6 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-class MomentumOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(param) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(grad) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Velocity"),
-                   "Input(velocity) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of Momentum should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of Momentum should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
-                   "Output(VelocityOut) of Momentum should not be null.");
-
-    auto param_dim = ctx->GetInputDim("Param");
-    if (ctx->GetInputsVarType("Grad")[0] ==
-        framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(
-          param_dim, ctx->GetInputDim("Grad"),
-          "Param and Grad input of MomentumOp should have the same dimension.");
-      PADDLE_ENFORCE_EQ(
-          param_dim, ctx->GetInputDim("Velocity"),
-          "Param and Velocity of MomentumOp should have the same dimension.");
-    }
-    PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1,
-                      "Learning_rate should be a scalar");
-
-    ctx->SetOutputDim("ParamOut", param_dim);
-    ctx->SetOutputDim("VelocityOut", param_dim);
-  }
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
-  }
-};
-
 class MomentumOpInferVarType : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDesc& op_desc,
diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h
index 6b4d00f56c..71f079e4d9 100644
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
@@ -28,6 +28,54 @@ using framework::SelectedRows;
 struct NoNesterov;
 struct UseNesterov;
 
+class MomentumOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(param) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(grad) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Velocity"),
+                   "Input(velocity) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of Momentum should not be null.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
+                   "Output(VelocityOut) of Momentum should not be null.");
+
+    auto param_dim = ctx->GetInputDim("Param");
+    if (ctx->GetInputsVarType("Grad")[0] ==
+        framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(
+          param_dim, ctx->GetInputDim("Grad"),
+          "Param and Grad input of MomentumOp should have the same dimension.");
+      PADDLE_ENFORCE_EQ(
+          param_dim, ctx->GetInputDim("Velocity"),
+          "Param and Velocity of MomentumOp should have the same dimension.");
+    }
+    PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1,
+                      "Learning_rate should be a scalar");
+
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("VelocityOut", param_dim);
+  }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+};
+
 template <typename T>
 class CPUDenseMomentumFunctor {
  private:
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index 31f083565f..1f090dc3d5 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -41,6 +41,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
     T *output_data = output->mutable_data<T>(ctx.GetPlace());
 
     std::string pooling_type = ctx.Attr<std::string>("pooling_type");
+    bool exclusive = ctx.Attr<bool>("exclusive");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
@@ -72,7 +73,8 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
     if (pooling_type == "max") {
       pooling_mode = PoolingMode::kMaximum;
     } else {
-      pooling_mode = PoolingMode::kAverage;
+      pooling_mode = exclusive ? PoolingMode::kAverageExclusive
+                               : PoolingMode::kAverageInclusive;
     }
 
     cudnnPoolingDescriptor_t cudnn_pool_desc =
@@ -101,6 +103,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
     Tensor *input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
 
     std::string pooling_type = ctx.Attr<std::string>("pooling_type");
+    bool exclusive = ctx.Attr<bool>("exclusive");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
@@ -141,7 +144,8 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
         pooling_mode = PoolingMode::kMaximum;
       }
     } else {
-      pooling_mode = PoolingMode::kAverage;
+      pooling_mode = exclusive ? PoolingMode::kAverageExclusive
+                               : PoolingMode::kAverageInclusive;
     }
 
     cudnnPoolingDescriptor_t cudnn_pool_desc =
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index f8ad63690e..484cb65746 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -151,8 +151,7 @@ void Pool2dOpMaker::Make() {
             "The format of output tensor is also NCHW, "
             "where N is batch size, C is the number of channels, "
             "H is the height of the feature, "
-            "and W is the width of the feature.")
-      .Reuse("X");
+            "and W is the width of the feature.");
 
   AddAttr<std::string>("pooling_type",
                        "(string), pooling type, can be \"max\" for max-pooling "
@@ -181,6 +180,12 @@ void Pool2dOpMaker::Make() {
       "operator."
       "If global_pooling = true, paddings and ksize will be ignored.")
       .SetDefault({0, 0});
+  AddAttr<bool>(
+      "exclusive",
+      "(bool, default True) When true, will exclude the zero-padding in the "
+      "averaging calculating, otherwise, include the zero-padding. Note, it "
+      "is only used when pooling_type is avg. The defalut is True.")
+      .SetDefault(true);
   AddAttr<bool>(
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
@@ -237,6 +242,23 @@ Example:
        W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
        $$
 
+  For exclusive = true:
+       $$
+       hstart = i * strides[0] - paddings[0]
+       hend = hstart + ksize[0]
+       wstart = j * strides[1] - paddings[1]
+       wend = wstart + ksize[1]
+       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
+       $$
+  For exclusive = false:
+       $$
+       hstart = max(0, i * strides[0] - paddings[0])
+       hend = min(H, hstart + ksize[0])
+       wstart = max(0, j * strides[1] - paddings[1])
+       wend = min(W, wstart + ksize[1])
+       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+       $$
+
 )DOC");
 }
 
@@ -252,8 +274,7 @@ void Pool3dOpMaker::Make() {
             "The format of output tensor is also NCDHW, "
             "where N is batch size, C is "
             "the number of channels, and D, H and W is the depth, height and "
-            "width of the feature, respectively.")
-      .Reuse("X");
+            "width of the feature, respectively.");
 
   AddAttr<std::string>("pooling_type",
                        "(string) Pooling type, can be \"max\" for max-pooling "
@@ -285,6 +306,12 @@ void Pool3dOpMaker::Make() {
       "If global_pooling = true, ksize and paddings will be ignored.")
       .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
                                // TypedAttrChecker don't support vector type.)
+  AddAttr<bool>(
+      "exclusive",
+      "(bool, default True) When true, will exclude the zero-padding in the "
+      "averaging calculating, otherwise, include the zero-padding. Note, it "
+      "is only used when pooling_type is avg. The defalut is True.")
+      .SetDefault(true);
 
   AddAttr<bool>(
       "use_cudnn",
diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h
index a63963ca92..c0594b7e3c 100644
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -69,6 +69,7 @@ class PoolKernel : public framework::OpKernel<T> {
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    bool exclusive = context.Attr<bool>("exclusive");
     if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
@@ -84,7 +85,7 @@ class PoolKernel : public framework::OpKernel<T> {
               pool2d_forward;
           paddle::operators::math::MaxPool<T> pool_process;
           pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                         out);
+                         true, out);
 
         } else if (pooling_type == "avg") {
           paddle::operators::math::Pool2dFunctor<
@@ -92,7 +93,7 @@ class PoolKernel : public framework::OpKernel<T> {
               pool2d_forward;
           paddle::operators::math::AvgPool<T> pool_process;
           pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                         out);
+                         exclusive, out);
         }
       } break;
       case 3: {
@@ -102,14 +103,14 @@ class PoolKernel : public framework::OpKernel<T> {
               pool3d_forward;
           paddle::operators::math::MaxPool<T> pool_process;
           pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                         out);
+                         true, out);
         } else if (pooling_type == "avg") {
           paddle::operators::math::Pool3dFunctor<
               DeviceContext, paddle::operators::math::AvgPool<T>, T>
               pool3d_forward;
           paddle::operators::math::AvgPool<T> pool_process;
           pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                         out);
+                         exclusive, out);
         }
       } break;
       default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
@@ -131,6 +132,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    bool exclusive = context.Attr<bool>("exclusive");
 
     if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
@@ -157,7 +159,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
                 pool2d_backward;
             paddle::operators::math::AvgPoolGrad<T> pool_process;
             pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                            paddings, pool_process, in_x_grad);
+                            paddings, pool_process, exclusive, in_x_grad);
           }
         } break;
         case 3: {
@@ -172,7 +174,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
                 pool3d_backward;
             paddle::operators::math::AvgPoolGrad<T> pool_process;
             pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                            paddings, pool_process, in_x_grad);
+                            paddings, pool_process, exclusive, in_x_grad);
           }
         } break;
         default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
index 8cd5058060..dc0940ac0b 100644
--- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
+++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
@@ -237,7 +237,7 @@ TEST(BlockingQueue, speed_test_mode) {
   }
   for (size_t i = 0; i < queue_size; ++i) {
     q2.Receive(&b);
-    EXPECT_EQ(b, 0);
+    EXPECT_EQ(b, 0UL);
   }
   EXPECT_EQ(q2.Size(), queue_size);
 }
diff --git a/paddle/fluid/operators/sequence_concat_op.h b/paddle/fluid/operators/sequence_concat_op.h
index 33e9babff2..ff035f421c 100644
--- a/paddle/fluid/operators/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_concat_op.h
@@ -17,7 +17,7 @@
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/operators/math/concat.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 
 namespace paddle {
 namespace operators {
@@ -106,7 +106,7 @@ class SeqConcatGradKernel : public framework::OpKernel<T> {
       }
     }
 
-    math::ConcatGradFunctor<DeviceContext, T> functor;
+    math::SplitFunctor<DeviceContext, T> functor;
     std::vector<const framework::Tensor *> sliced_x_ptr;
     std::vector<framework::Tensor *> sliced_dx_ptr;
     for (auto &x : sliced_x) {
diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc
index 15d3f064eb..217bb1610f 100644
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
@@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
               "(Tensor<int>) This tensor is used for the sequence max-pooling "
               "to record the max indexes.")
         .AsIntermediate();
+    AddAttr<bool>("is_test", "").SetDefault(false);
     AddAttr<std::string>(
         "pooltype",
         "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
diff --git a/paddle/fluid/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h
index 2aa20792f2..f2e4a55dee 100644
--- a/paddle/fluid/operators/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_pool_op.h
@@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     auto* in = context.Input<LoDTensor>("X");
     auto* out = context.Output<Tensor>("Out");
     std::string pooltype = context.Attr<std::string>("pooltype");
-    Tensor* index = nullptr;
-    if (pooltype == "MAX") {
-      index = context.Output<Tensor>("MaxIndex");
-    }
 
     auto dims = in->dims();
     auto lod = in->lod();
@@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     dims[0] = lod[0].size() - 1;
     out->Resize({dims});
     out->mutable_data<T>(context.GetPlace());
-    if (pooltype == "MAX") {
+    Tensor* index = nullptr;
+
+    const bool is_test = context.Attr<bool>("is_test");
+
+    // Do not create index buffer for inference (is_test) mode
+    // TODO(jczaja): Skip index buffer creation for other devices eg. GPU
+    if (pooltype == "MAX" &&
+        (is_test == false ||
+         platform::is_cpu_place(context.GetPlace()) == false)) {
+      index = context.Output<Tensor>("MaxIndex");
       index->Resize({dims});
       index->mutable_data<int>(context.GetPlace());
     }
     math::SequencePoolFunctor<DeviceContext, T> pool;
     pool(context.template device_context<DeviceContext>(), pooltype, *in, out,
-         index);
+         is_test, index);
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_reverse_op.cc b/paddle/fluid/operators/sequence_reverse_op.cc
new file mode 100644
index 0000000000..1428cca1a6
--- /dev/null
+++ b/paddle/fluid/operators/sequence_reverse_op.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/sequence_reverse_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(sequence_reverse, ops::SequenceReverseOp,
+                  ops::SequenceReverseOpMaker,
+                  ops::SequenceReverseGradOpDescMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    sequence_reverse,
+    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, uint8_t>,
+    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceReverseOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/sequence_reverse_op.cu b/paddle/fluid/operators/sequence_reverse_op.cu
new file mode 100644
index 0000000000..ce65f4799e
--- /dev/null
+++ b/paddle/fluid/operators/sequence_reverse_op.cu
@@ -0,0 +1,25 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/sequence_reverse_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    sequence_reverse,
+    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, uint8_t>,
+    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceReverseOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/sequence_reverse_op.h b/paddle/fluid/operators/sequence_reverse_op.h
new file mode 100644
index 0000000000..39dad2311b
--- /dev/null
+++ b/paddle/fluid/operators/sequence_reverse_op.h
@@ -0,0 +1,157 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/algorithm.h"
+#include "paddle/fluid/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceReverseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist");
+
+    auto x_dim = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_GE(x_dim.size(), 2,
+                      "Rank of Input(X) must be not less than 2.");
+
+    ctx->SetOutputDim("Y", x_dim);
+    ctx->ShareLoD("X", "Y");
+  }
+};
+
+class SequenceReverseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input LoDTensor of sequence_reverse op.");
+    AddOutput("Y", "The output LoDTensor of sequence_reverse op.");
+    AddComment(R"DOC(
+SequenceReverse Operator.
+
+Reverse each sequence in input X along dim 0.
+
+Assuming X is a LoDTensor with dims [5, 4] and lod [[0, 2, 5]], where:
+
+X.data() = [
+  [1, 2, 3, 4],
+  [5, 6, 7, 8], # the 0-th sequence with length 2
+  [9, 10, 11, 12],
+  [13, 14, 15, 16],
+  [17, 18, 19, 20] # the 1-st sequence with length 3
+]
+
+The output Y would be a LoDTensor sharing the same dims and lod with input X,
+and:
+
+Y.data() = [
+  [5, 6, 7, 8],
+  [1, 2, 3, 4], # the reversed 0-th sequence with length 2
+  [17, 18, 19, 20],
+  [13, 14, 15, 16],
+  [9, 10, 11, 12] # the reversed 1-st sequence with length 3
+]
+
+This Operator is useful to build a reverse dynamic RNN network.
+
+This Operator only supports one-level lod currently.
+    )DOC");
+  }
+};
+
+template <typename T>
+struct SequenceReverseFunctor {
+  SequenceReverseFunctor(const T *x, T *y, const size_t *lod, size_t lod_count,
+                         size_t row_numel)
+      : x_(x), y_(y), lod_(lod), lod_count_(lod_count), row_numel_(row_numel) {}
+
+  HOSTDEVICE void operator()(size_t idx_x) const {
+    auto row_idx_x = idx_x / row_numel_;
+    auto lod_idx = math::UpperBound(lod_, lod_count_, row_idx_x);
+    auto row_idx_y = lod_[lod_idx - 1] + (lod_[lod_idx] - 1 - row_idx_x);
+    auto idx_y = row_idx_y * row_numel_ + idx_x % row_numel_;
+    y_[idx_y] = x_[idx_x];
+  }
+
+  const T *x_;
+  T *y_;
+  const size_t *lod_;
+  size_t lod_count_;
+  size_t row_numel_;
+};
+
+template <typename DeviceContext, typename T>
+class SequenceReverseOpKernel : public framework::OpKernel<T> {
+  using LoDTensor = framework::LoDTensor;
+
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto &x = *ctx.Input<LoDTensor>("X");
+    auto *y = ctx.Output<LoDTensor>("Y");
+
+    PADDLE_ENFORCE_EQ(x.lod().size(), 1,
+                      "SequenceReverse Op only support one level lod.");
+
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    const size_t *lod;
+    size_t lod_count = x.lod()[0].size();
+
+#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      lod = x.lod()[0].CUDAData(ctx.GetPlace());
+    } else {
+#endif
+      lod = x.lod()[0].data();
+#ifdef PADDLE_WITH_CUDA
+    }
+#endif
+
+    size_t limit = static_cast<size_t>(x.numel());
+    size_t row_numel = static_cast<size_t>(limit / x.dims()[0]);
+    auto *x_data = x.data<T>();
+    auto *y_data = y->mutable_data<T>(ctx.GetPlace());
+
+    PADDLE_ENFORCE_NE(x_data, y_data,
+                      "SequenceReverse Op does not support in-place operation");
+
+    SequenceReverseFunctor<T> functor(x_data, y_data, lod, lod_count,
+                                      row_numel);
+    platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
+    for_range(functor);
+  }
+};
+
+class SequenceReverseGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("sequence_reverse");
+    op->SetInput("X", OutputGrad("Y"));
+    op->SetOutput("Y", InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc
index 411a126bc8..ea62acd08c 100644
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@@ -77,8 +77,7 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Grad", "(Tensor or SelectedRows) Input gradient");
     AddOutput("ParamOut",
               "(Tensor or SelectedRows, same with Param) "
-              "Output parameter, should share the same memory with Param")
-        .Reuse("Param");
+              "Output parameter, should share the same memory with Param");
     AddComment(R"DOC(
 
 SGD operator
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
index 2bdb23e999..f6e241af06 100644
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -76,6 +76,8 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
                    ops::SoftmaxCUDNNKernel<float>,
+                   ops::SoftmaxCUDNNKernel<double>,
                    ops::SoftmaxCUDNNKernel<plat::float16>);
 REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
-                   ops::SoftmaxGradCUDNNKernel<float>);
+                   ops::SoftmaxGradCUDNNKernel<float>,
+                   ops::SoftmaxGradCUDNNKernel<double>);
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index bb08123882..a4bdbe6648 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -80,8 +80,7 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X",
              "The input tensor of softmax, "
              "whose last dimension is the input_feature_dimensions.");
-    AddOutput("Out", "The normalized values with the same shape as X.")
-        .Reuse("X");
+    AddOutput("Out", "The normalized values with the same shape as X.");
     AddAttr<bool>(
         "use_cudnn",
         "(bool, default false) Only used in cudnn kernel, need install cudnn")
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 1a9324ec86..2900221485 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -44,6 +44,12 @@ class SoftmaxWithCrossEntropyOpMaker
         "(bool, default: false), A flag to indicate whether to interpretate "
         "the given labels as soft labels.")
         .SetDefault(false);
+    AddAttr<bool>(
+        "numeric_stable_mode",
+        "(bool, default: false), A flag to indicate whether to use more "
+        "numerically stable algorithm. This flag is only valid when "
+        "soft_label is false and GPU is used.")
+        .SetDefault(false);
     AddAttr<int>(
         "ignore_index",
         "(int, default -100), Specifies a target value that is ignored and"
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index a07c17348e..6d48796191 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cub/cub.cuh>
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
+#include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
@@ -117,8 +118,8 @@ using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
 // Make sure that BlockDim <= feature_size
 // This kernel is used to calculate the max element of each row
 template <typename T, int BlockDim>
-__global__ void RowReductionForMax(const T* logits_data, T* max_data,
-                                   int feature_size) {
+static __global__ void RowReductionForMax(const T* logits_data, T* max_data,
+                                          int feature_size) {
   __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
 
   auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
@@ -141,9 +142,10 @@ __global__ void RowReductionForMax(const T* logits_data, T* max_data,
 }
 
 // Make sure that BlockDim <= feature_size
-template <typename T, int BlockDim>
-__global__ void RowReductionForDiffMaxSum(const T* logits_data, T* max_data,
-                                          T* softmax, int feature_size) {
+template <typename T, int BlockDim, bool CalculateLogSoftmax = false>
+static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
+                                                 T* max_data, T* softmax,
+                                                 int feature_size) {
   __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
 
   auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
@@ -153,24 +155,34 @@ __global__ void RowReductionForDiffMaxSum(const T* logits_data, T* max_data,
 
   softmax[beg_idx] = logits_data[beg_idx] - block_max;
   T diff_max_sum = real_exp(softmax[beg_idx]);
-  beg_idx += BlockDim;
-  while (beg_idx < end_idx) {
-    softmax[beg_idx] = logits_data[beg_idx] - block_max;
-    diff_max_sum += real_exp(softmax[beg_idx]);
-    beg_idx += BlockDim;
+  auto idx = beg_idx + BlockDim;
+  while (idx < end_idx) {
+    softmax[idx] = logits_data[idx] - block_max;
+    diff_max_sum += real_exp(softmax[idx]);
+    idx += BlockDim;
   }
 
   diff_max_sum =
       BlockReduce<T, BlockDim>(temp_storage).Reduce(diff_max_sum, cub::Sum());
   if (threadIdx.x == 0) max_data[blockIdx.x] = real_log(diff_max_sum);
+
+  if (!CalculateLogSoftmax) return;
+  __syncthreads();
+  diff_max_sum = max_data[blockIdx.x];
+  softmax[beg_idx] -= diff_max_sum;
+  beg_idx += BlockDim;
+  while (beg_idx < end_idx) {
+    softmax[beg_idx] -= diff_max_sum;
+    beg_idx += BlockDim;
+  }
+  if (threadIdx.x == 0) max_data[blockIdx.x] = 0;
 }
 
 // Make sure that BlockDim <= feature_size
 template <typename T, int BlockDim>
-__global__ void RowReductionForSoftmaxAndCrossEntropy(const T* logits_data,
-                                                      const T* labels_data,
-                                                      T* loss_data, T* softmax,
-                                                      int feature_size) {
+static __global__ void RowReductionForSoftmaxAndCrossEntropy(
+    const T* logits_data, const T* labels_data, T* loss_data, T* softmax,
+    int feature_size) {
   __shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
 
   auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
@@ -194,11 +206,134 @@ __global__ void RowReductionForSoftmaxAndCrossEntropy(const T* logits_data,
 }
 
 template <typename T>
-__global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out, int batch_size) {
+struct HardLabelSoftmaxWithCrossEntropyFunctor {
+ public:
+  HardLabelSoftmaxWithCrossEntropyFunctor(const T* logits,
+                                          const int64_t* labels, T* loss,
+                                          T* log_softmax, int feature_size)
+      : logits_(logits),
+        labels_(labels),
+        loss_(loss),
+        log_softmax_(log_softmax),
+        feature_size_(feature_size) {}
+
+  __device__ void operator()(int idx) const {
+    auto row_idx = idx / feature_size_;
+    auto col_idx = idx % feature_size_;
+    if (col_idx != labels_[row_idx]) {
+      log_softmax_[idx] = real_exp(log_softmax_[idx]);
+    } else {
+      auto softmax = log_softmax_[idx];
+      log_softmax_[idx] = real_exp(softmax);
+      loss_[row_idx] = -softmax;
+    }
+  }
+
+ private:
+  const T* logits_;
+  const int64_t* labels_;
+  T* loss_;
+  T* log_softmax_;
+  int feature_size_;
+};
+
+template <typename T>
+struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx {
+ public:
+  HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx(const T* logits,
+                                                       const int64_t* labels,
+                                                       T* loss, T* log_softmax,
+                                                       int feature_size,
+                                                       int ignore_idx)
+      : logits_(logits),
+        labels_(labels),
+        loss_(loss),
+        log_softmax_(log_softmax),
+        feature_size_(feature_size),
+        ignore_idx_(ignore_idx) {}
+
+  __device__ void operator()(int idx) const {
+    auto row_idx = idx / feature_size_;
+    auto col_idx = idx % feature_size_;
+    if (col_idx != labels_[row_idx] || col_idx == ignore_idx_) {
+      log_softmax_[idx] = real_exp(log_softmax_[idx]);
+    } else {
+      auto softmax = log_softmax_[idx];
+      log_softmax_[idx] = real_exp(softmax);
+      loss_[row_idx] = -softmax;
+    }
+  }
+
+ private:
+  const T* logits_;
+  const int64_t* labels_;
+  T* loss_;
+  T* log_softmax_;
+  int feature_size_;
+  int ignore_idx_;
+};
+
+template <typename T>
+static __global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out,
+                                                           int batch_size) {
   auto idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < batch_size) out[idx] = static_cast<T>(1);
 }
 
+template <typename T>
+static void HardLabelSoftmaxWithCrossEntropy(
+    const platform::CUDADeviceContext& ctx, const T* logits_data,
+    const int64_t* labels_data, T* loss_data, T* softmax_data, int batch_size,
+    int feature_size, int ignore_idx) {
+  constexpr int kMaxBlockDim = 512;
+  int block_dim = feature_size >= kMaxBlockDim
+                      ? kMaxBlockDim
+                      : (1 << static_cast<int>(std::log2(feature_size)));
+  auto stream = ctx.stream();
+
+#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim)    \
+  case BlockDim: {                                                           \
+    RowReductionForMax<T, BlockDim><<<batch_size, BlockDim, 0, stream>>>(    \
+        logits_data, loss_data, feature_size);                               \
+    RowReductionForDiffMaxSum<T, BlockDim,                                   \
+                              true><<<batch_size, BlockDim, 0, stream>>>(    \
+        logits_data, loss_data, softmax_data, feature_size);                 \
+    platform::ForRange<platform::CUDADeviceContext> for_range(               \
+        ctx, batch_size* feature_size);                                      \
+    if (ignore_idx >= 0 && ignore_idx < feature_size) {                      \
+      for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx<T>(     \
+          logits_data, labels_data, loss_data, softmax_data, feature_size,   \
+          ignore_idx));                                                      \
+    } else {                                                                 \
+      for_range(HardLabelSoftmaxWithCrossEntropyFunctor<T>(                  \
+          logits_data, labels_data, loss_data, softmax_data, feature_size)); \
+    }                                                                        \
+  } break
+
+  switch (block_dim) {
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4);
+    CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2);
+    case 1:
+      SetSoftmaxToOneWhenFeatureSizeIsOne<<<(batch_size + kMaxBlockDim - 1) /
+                                                kMaxBlockDim,
+                                            kMaxBlockDim, 0, stream>>>(
+          softmax_data, batch_size);
+      cudaMemsetAsync(loss_data, 0, batch_size * sizeof(T), stream);
+      break;
+    default:
+      PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
+      break;
+  }
+#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
+}
+
 template <typename T>
 static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
                                                const T* labels_data,
@@ -237,7 +372,7 @@ static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
                                                 kMaxBlockDim,
                                             kMaxBlockDim, 0, stream>>>(
           softmax_data, batch_size);
-      cudaMemsetAsync(loss_data, 0, batch_size, stream);
+      cudaMemsetAsync(loss_data, 0, batch_size * sizeof(T), stream);
       break;
     default:
       PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
@@ -272,11 +407,21 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
           logits_data, labels_data, softmax_data, loss_data, batch_size,
           feature_size, context.cuda_device_context().stream());
     } else {
-      math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
-                                     softmax);
-      math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
-          context.cuda_device_context(), loss, softmax, labels, false,
-          ignore_index);
+      if (!context.Attr<bool>("numeric_stable_mode")) {
+        math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
+                                       softmax);
+        math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
+            context.cuda_device_context(), loss, softmax, labels, false,
+            ignore_index);
+      } else {
+        int batch_size = logits->dims()[0];
+        int feature_size = logits->dims()[1];
+        auto* logits_data = logits->data<T>();
+        auto* labels_data = labels->data<int64_t>();
+        HardLabelSoftmaxWithCrossEntropy<T>(
+            context.cuda_device_context(), logits_data, labels_data, loss_data,
+            softmax_data, batch_size, feature_size, ignore_index);
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc
index c867c46873..243f81e296 100644
--- a/paddle/fluid/operators/split_ids_op.cc
+++ b/paddle/fluid/operators/split_ids_op.cc
@@ -20,20 +20,27 @@ namespace operators {
 class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
-    AddOutput("Out", "(LoDTensor) The outputs of the input Ids.")
+    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}")
+        .AsDuplicable();
+
+    AddOutput("Out", "(LoDTensors) The outputs of the input Ids.")
         .AsDuplicable();
 
     AddComment(R"DOC(
 Split a LoDTensor of Ids into multi LoDTensors, the number is pserver's number
 Example:
   Input:
-    X = [1,2,3,4,5,6]
+    X = [[1,2,3,4,5,6],[2,3]]
 
   Out(3 output):
-    out0 = [3, 6]
-    out1 = [1, 4]
-    out2 = [2, 5]
+    if compress is True:
+        out0 = [3, 3, 6]
+        out1 = [1, 4]
+        out2 = [2, 2, 5]
+    else:
+        out0 = [3, 6]
+        out1 = [1, 4]
+        out2 = [2, 5]
 )DOC");
   }
 };
@@ -43,16 +50,24 @@ class SplitIdsOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Ids"), "SplitIdsOp must has input Ids.");
+    PADDLE_ENFORCE(ctx->HasInputs("Ids"), "SplitIdsOp must has input Ids.");
     PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");
 
     auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    auto ids_dims = ctx->GetInputDim("Ids");
+    auto ids_dims = ctx->GetInputsDim("Ids");
     if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
-      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+      PADDLE_ENFORCE_EQ(ids_dims[0].size(), 2);
     }
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.MultiInput<framework::Tensor>("Ids").front()->type()),
+        ctx.GetPlace());
+  }
 };
 
 class SplitIdsOpInferVarType : public framework::VarTypeInference {
@@ -66,12 +81,28 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference {
   }
 };
 
+class SplitIdsOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto grad = new framework::OpDesc();
+    grad->SetType("concat");
+    grad->SetInput("X", OutputGrad("Out"));
+    grad->SetOutput("Out", InputGrad("Ids"));
+    grad->SetAttr("axis", 0);
+    return std::unique_ptr<framework::OpDesc>(grad);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
-                  ops::SplitIdsOpInferVarType);
+                  ops::SplitIdsOpGradMaker, ops::SplitIdsOpInferVarType);
+
 REGISTER_OP_CPU_KERNEL(
     split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>,
     ops::SplitIdsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
index c4af5a65fc..69ac6c5a6b 100644
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <iterator>
+#include <set>
 #include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
@@ -31,19 +33,39 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
       PADDLE_THROW("SplitIds do not support GPU kernel");
     }
 
-    const auto *ids_var = ctx.InputVar("Ids");
+    const auto ids_vars = ctx.MultiInputVar("Ids");
+
+    PADDLE_ENFORCE_GT(ids_vars.size(), 0, "The number of Ids should > 0");
+    auto *ids_var = ids_vars[0];
+
     if (ids_var->IsType<framework::LoDTensor>()) {
-      const auto &ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
-      const T *ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
+      int batch_size = 0;
+      const auto ids_tensors = ctx.MultiInput<framework::LoDTensor>("Ids");
+      for (size_t i = 0; i < ids_tensors.size(); ++i) {
+        batch_size += ids_tensors[i]->dims()[0];
+      }
+      VLOG(4) << "Get Total BatchSize is: " << batch_size;
+
+      std::vector<T> all_ids(batch_size);
+      int offset = 0;
+      for (size_t i = 0; i < ids_tensors.size(); ++i) {
+        const auto *ids = ids_tensors[i];
+        std::memcpy(all_ids.data() + offset, ids->data<T>(),
+                    ids->numel() * sizeof(T));
+        offset += ids->numel();
+      }
+
+      std::set<T> st(all_ids.begin(), all_ids.end());
+      all_ids.assign(st.begin(), st.end());
+
       auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
       const size_t shard_num = outs.size();
-
       std::vector<std::vector<T>> out_ids;
       out_ids.resize(outs.size());
 
       // split id by their shard_num.
-      for (int i = 0; i < ids_dims[0]; ++i) {
-        T id = ids[i];
+      for (int i = 0; i < all_ids.size(); ++i) {
+        T id = all_ids[i];
         size_t shard_id = static_cast<size_t>(id) % shard_num;
         out_ids[shard_id].push_back(id);
       }
@@ -64,7 +86,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(ids_dims[0],
                         static_cast<int64_t>(ids_selected_rows->rows().size()),
                         "");
-      const T *ids = ids_selected_rows->value().data<T>();
+      const T *ids_data = ids_selected_rows->value().data<T>();
       const auto &ids_rows = ids_selected_rows->rows();
       auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
       const size_t shard_num = outs.size();
@@ -87,7 +109,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
         T *output = out->mutable_value()->mutable_data<T>(ddim, place);
         for (int64_t i = 0; i < ddim[0]; ++i) {
           memcpy(output + i * row_width,
-                 ids + id_to_index[out->rows()[i]] * row_width,
+                 ids_data + id_to_index[out->rows()[i]] * row_width,
                  row_width * sizeof(T));
         }
       }
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index d661b276bc..a05582ae09 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -111,11 +111,10 @@ Example:
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-USE_CPU_ONLY_OP(concat);
 
 REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker);
-REGISTER_OP_CPU_KERNEL(split,
-                       ops::SplitOpKernel<paddle::platform::CPUPlace, double>,
-                       ops::SplitOpKernel<paddle::platform::CPUPlace, float>,
-                       ops::SplitOpKernel<paddle::platform::CPUPlace, int64_t>,
-                       ops::SplitOpKernel<paddle::platform::CPUPlace, int>);
+REGISTER_OP_CPU_KERNEL(
+    split, ops::SplitOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SplitOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SplitOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::SplitOpKernel<paddle::platform::CPUDeviceContext, int>);
diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h
index f0c417c705..6f4a25ab5e 100644
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <chrono>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 
 namespace paddle {
@@ -28,18 +29,22 @@ class SplitOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto outs = ctx.MultiOutput<framework::Tensor>("Out");
-    auto in_stride = framework::stride_numel(in->dims());
-    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    int axis = ctx.Attr<int>("axis");
     auto place = ctx.GetPlace();
 
-    size_t input_offset = 0;
-    for (auto& out : outs) {
-      out->mutable_data<T>(ctx.GetPlace());
-      auto out_stride = framework::stride_numel(out->dims());
-      StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
-                                  out_stride, in->data<T>() + input_offset,
-                                  in_stride, out_stride[axis]);
-      input_offset += out_stride[axis];
+    std::vector<const framework::Tensor*> shape_refer;
+    for (size_t j = 0; j < outs.size(); ++j) {
+      outs[j]->mutable_data<T>(ctx.GetPlace());
+      shape_refer.emplace_back(outs[j]);
+    }
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    // Sometimes direct copies will be faster, this maybe need deeply analysis.
+    if (axis == 0 && outs.size() < 10) {
+      StridedMemcpyWithAxis0<T>(dev_ctx, *in, shape_refer, &outs);
+    } else {
+      math::SplitFunctor<DeviceContext, T> functor;
+      functor(dev_ctx, *in, shape_refer, axis, &outs);
     }
   }
 };
diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc
index 76615a9405..0e7b1463d1 100644
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
@@ -22,9 +22,9 @@ class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "The input SelectedRows.");
     AddOutput("Out", "The outputs of the input SelectedRows.").AsDuplicable();
-    AddAttr<std::vector<int>>("height_sections",
-                              "Height for each output SelectedRows.")
-        .SetDefault(std::vector<int>({}));
+    AddAttr<std::vector<int64_t>>("height_sections",
+                                  "Height for each output SelectedRows.")
+        .SetDefault(std::vector<int64_t>({}));
 
     AddComment(R"DOC(
 Split a SelectedRows with a specified rows section.
diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h
index 0e9ce165b9..af64607faf 100644
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-static int FindOutIdx(int row, const std::vector<int>& abs_sections) {
+static int FindOutIdx(int row, const std::vector<int64_t>& abs_sections) {
   for (size_t i = 1; i < abs_sections.size(); ++i) {
     if (row < abs_sections[i]) {
       return i - 1;
@@ -30,9 +30,9 @@ static int FindOutIdx(int row, const std::vector<int>& abs_sections) {
   return abs_sections.size() - 1;
 }
 
-static std::vector<int> ToAbsoluteSection(
-    const std::vector<int>& height_sections) {
-  std::vector<int> abs_sections;
+static std::vector<int64_t> ToAbsoluteSection(
+    const std::vector<int64_t>& height_sections) {
+  std::vector<int64_t> abs_sections;
   abs_sections.resize(height_sections.size());
   abs_sections[0] = 0;
   for (size_t i = 1; i < height_sections.size(); ++i) {
@@ -47,7 +47,7 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* x = ctx.Input<framework::SelectedRows>("X");
     auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
-    auto height_sections = ctx.Attr<std::vector<int>>("height_sections");
+    auto height_sections = ctx.Attr<std::vector<int64_t>>("height_sections");
 
     auto abs_sections = ToAbsoluteSection(height_sections);
 
diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h
index 08cb7849d2..35d9737ee0 100644
--- a/paddle/fluid/operators/spp_op.h
+++ b/paddle/fluid/operators/spp_op.h
@@ -56,12 +56,14 @@ class SppKernel : public framework::OpKernel<T> {
         math::Pool2dFunctor<DeviceContext, math::MaxPool<T>, T> pool_forward;
         math::MaxPool<T> max_process;
         pool_forward(context.template device_context<DeviceContext>(), *in_x,
-                     kernel_size, strides, paddings, max_process, &out_level);
+                     kernel_size, strides, paddings, max_process, true,
+                     &out_level);
       } else if (pooling_type == "avg") {
         math::Pool2dFunctor<DeviceContext, math::AvgPool<T>, T> pool_forward;
         math::AvgPool<T> avg_process;
         pool_forward(context.template device_context<DeviceContext>(), *in_x,
-                     kernel_size, strides, paddings, avg_process, &out_level);
+                     kernel_size, strides, paddings, avg_process, true,
+                     &out_level);
       }
       // flatten pooling output shape
       int output_flatten_w = in_x->dims()[1] * bins * bins;
@@ -154,7 +156,7 @@ class SppGradKernel : public framework::OpKernel<T> {
         math::AvgPoolGrad<T> avg_process;
         pool_backward(context.template device_context<DeviceContext>(), *in_x,
                       *&out_level, *&outgrad_level, kernel_size, strides,
-                      paddings, avg_process, in_x_grad);
+                      paddings, avg_process, true, in_x_grad);
       }
     }
   }
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index 7a10218e15..c3d83a06f2 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/detail/strided_memcpy.h"
-
 namespace paddle {
 namespace operators {
 
@@ -98,5 +99,26 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
   }
 }
 
+template <typename T>
+inline void StridedMemcpyWithAxis0(
+    const platform::DeviceContext& dev_ctx, const framework::Tensor& input,
+    const std::vector<const framework::Tensor*>& shape_refer,
+    std::vector<framework::Tensor*>* outputs) {
+  const framework::DDim in_stride = stride_numel(input.dims());
+  const int axis = 0;
+  size_t input_offset = 0;
+
+  for (size_t i = 0; i < outputs->size(); ++i) {
+    auto out_stride = stride_numel(shape_refer[i]->dims());
+    auto out = outputs->at(i);
+    if (out != nullptr) {
+      StridedNumelCopyWithAxis<T>(dev_ctx, axis, out->data<T>(), out_stride,
+                                  input.data<T>() + input_offset, in_stride,
+                                  out_stride[axis]);
+    }
+    input_offset += out_stride[axis];
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index fe7c7039c7..d19ac9839c 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -67,6 +67,7 @@ class SumOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     auto x_vars = ctx.MultiInputVar("X");
+    auto x_vars_name = ctx.Inputs("X");
 
     framework::LibraryType library{framework::LibraryType::kPlain};
     framework::DataLayout layout{framework::DataLayout::kAnyLayout};
@@ -81,15 +82,18 @@ class SumOp : public framework::OperatorWithKernel {
 
     if (x_vars[0]->IsType<framework::LoDTensor>()) {
       int dtype = -1;
-      for (auto& x_var : x_vars) {
-        auto& lod_tensor = x_var->Get<framework::LoDTensor>();
-        if (lod_tensor.numel() == 0) {
+      for (size_t idx = 0; idx < x_vars.size(); ++idx) {
+        PADDLE_ENFORCE(x_vars[idx] != nullptr,
+                       "Input var[%s] should not be nullptr", x_vars_name[idx]);
+        // FIXME(zcd): The input x_var may be SelectedRows or LoDTensor.
+        auto tensor = framework::GetTensorFromVar(*x_vars[idx]);
+        if (tensor->numel() == 0) {
           continue;
         }
         if (dtype == -1) {
-          dtype = framework::ToDataType(lod_tensor.type());
+          dtype = framework::ToDataType(tensor->type());
         } else {
-          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type()));
+          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(tensor->type()));
         }
       }
       PADDLE_ENFORCE_NE(dtype, -1,
@@ -132,7 +136,7 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
         .AsDuplicable();
-    AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X");
+    AddOutput("Out", "(Tensor) The output tensor of sum operator.");
     AddAttr<bool>("use_mkldnn",
                   "(bool, default false) Only used in mkldnn kernel")
         .SetDefault(false);
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index 11987c61ae..f6e12dfc76 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -83,79 +83,54 @@ class SumKernel : public framework::OpKernel<T> {
         }
       }
     } else if (out_var->IsType<framework::SelectedRows>()) {
-      std::unique_ptr<framework::SelectedRows> in0;
-      if (in_place) {
-        // If is in_place, we store the input[0] to in0
-        auto &in_sel0 = in_vars[0]->Get<SelectedRows>();
-        auto &rows = in_sel0.rows();
-#ifdef PADDLE_WITH_CUDA
-        std::vector<int64_t> rows_in_cpu;
-        rows_in_cpu.reserve(rows.size());
-        for (auto item : rows) {
-          rows_in_cpu.push_back(item);
-        }
-        in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height()));
-#else
-        in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
-#endif
-        in0->mutable_value()->ShareDataWith(in_sel0.value());
+      if (in_place && in_vars.size() < 2) {
+        return;
       }
 
-      auto get_selected_row = [&](size_t i) -> const SelectedRows & {
-        if (i == 0 && in0) {
-          return *in0.get();
-        } else {
-          return in_vars[i]->Get<SelectedRows>();
+      std::vector<const paddle::framework::SelectedRows *> inputs;
+      SelectedRows temp_in0;
+
+      if (in_place) {
+        auto &in0 = in_vars[0]->Get<SelectedRows>();
+        temp_in0.set_height(in0.height());
+        temp_in0.set_rows(in0.rows());
+        framework::TensorCopy(in0.value(), in0.place(),
+                              context.device_context(),
+                              temp_in0.mutable_value());
+        inputs.push_back(&temp_in0);
+        for (size_t i = 1; i < in_vars.size(); ++i) {
+          auto &in = in_vars[i]->Get<SelectedRows>();
+          if (in.rows().size() > 0) {
+            inputs.push_back(&in);
+          }
+        }
+      } else {
+        for (auto &in_var : in_vars) {
+          auto &in = in_var->Get<SelectedRows>();
+          if (in.rows().size() > 0) {
+            inputs.push_back(&in_var->Get<SelectedRows>());
+          }
         }
-      };
+      }
 
       auto *out = context.Output<SelectedRows>("Out");
       out->mutable_rows()->clear();
-      auto *out_value = out->mutable_value();
-
-      // Runtime InferShape
-      size_t first_dim = 0;
-      for (size_t i = 0; i < in_num; i++) {
-        auto &sel_row = get_selected_row(i);
-        first_dim += sel_row.rows().size();
-      }
 
-      std::vector<int64_t> in_dim;
-      for (size_t i = 0; i < in_num; i++) {
-        auto &sel_row = get_selected_row(i);
-        if (sel_row.rows().size() > 0) {
-          in_dim = framework::vectorize(sel_row.value().dims());
+      bool has_data = false;
+      for (auto &in : inputs) {
+        if (in->rows().size() > 0) {
+          has_data = true;
           break;
         }
       }
-      if (in_dim.empty()) {
-        VLOG(3) << "WARNING: all the inputs are empty";
-        in_dim =
-            framework::vectorize(get_selected_row(in_num - 1).value().dims());
+      if (has_data) {
+        math::scatter::MergeAdd<DeviceContext, T> merge_add;
+        merge_add(context.template device_context<DeviceContext>(), inputs,
+                  out);
       } else {
-        in_dim[0] = static_cast<int64_t>(first_dim);
-      }
-
-      out_value->Resize(framework::make_ddim(in_dim));
-      out_value->mutable_data<T>(context.GetPlace());
-      // if all the input sparse vars are empty, no need to
-      // merge these vars.
-      if (first_dim == 0UL) {
-        return;
-      }
-
-      math::SelectedRowsAddTo<DeviceContext, T> functor;
-
-      int64_t offset = 0;
-      for (size_t i = 0; i < in_num; i++) {
-        auto &sel_row = get_selected_row(i);
-        if (sel_row.rows().size() == 0) {
-          continue;
-        }
-        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
-        functor(context.template device_context<DeviceContext>(), sel_row,
-                offset, out);
-        offset += sel_row.value().numel();
+        // no data, just set a empty out tensor.
+        out->mutable_value()->mutable_data<T>(framework::make_ddim({0}),
+                                              context.GetPlace());
       }
     } else if (out_var->IsType<framework::LoDTensorArray>()) {
       auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
index 4a8ac441cf..c17d1afc30 100644
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -50,7 +50,7 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X", "(Tensor) The input of Topk op");
-    AddOutput("Out", "(Tensor) The output tensor of Topk op").Reuse("X");
+    AddOutput("Out", "(Tensor) The output tensor of Topk op");
     AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
     AddComment(R"DOC(
 Top K operator
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index 8e4a07556f..0cad224ca8 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -262,31 +262,31 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
                              const T* src, int lds, int dim, int k,
                              int grid_dim, int num) {
   __shared__ Pair<T> sh_topk[BlockSize];
-  __shared__ int maxid[BlockSize / 2];
   const int tid = threadIdx.x;
   const int warp = threadIdx.x / 32;
 
   const int bid = blockIdx.x;
   for (int i = bid; i < num; i += grid_dim) {
-    output += i * output_stride;
-    indices += i * k;
-
+    int top_num = k;
+    __shared__ int maxid[BlockSize / 2];
+    T* out = output + i * output_stride;
+    int64_t* inds = indices + i * k;
     Pair<T> topk[MaxLength];
     int beam = MaxLength;
     Pair<T> max;
     bool is_empty = false;
     bool firststep = true;
 
-    for (int k = 0; k < MaxLength; k++) {
-      topk[k].set(-INFINITY, -1);
+    for (int j = 0; j < MaxLength; j++) {
+      topk[j].set(-INFINITY, -1);
     }
-    while (k) {
+    while (top_num) {
       ThreadGetTopK<T, MaxLength, BlockSize>(
           topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid);
 
       sh_topk[tid] = topk[0];
-      BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
-                                           &indices, &beam, &k, tid, warp);
+      BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
+                                           &beam, &top_num, tid, warp);
     }
   }
 }
@@ -327,13 +327,15 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     size_t k = static_cast<int>(ctx.Attr<int>("k"));
 
     const T* input_data = input->data<T>();
-
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     // FIXME(typhoonzero): data is always converted to type T?
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
 
-    size_t input_height = input->dims()[0];
-    size_t input_width = input->dims()[1];
+    framework::DDim inputdims = input->dims();
+    const size_t input_height = framework::product(
+        framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
+    const size_t input_width = inputdims[inputdims.size() - 1];
+
     if (k > input_width) k = input_width;
 
     // NOTE: pass lds and dim same to input width.
@@ -342,14 +344,12 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     const int kMaxHeight = 2048;
     int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
     auto& dev_ctx = ctx.cuda_device_context();
-
     switch (GetDesiredBlockDim(input_width)) {
       FIXED_BLOCK_DIM(
           KeMatrixTopK<T, 5,
                        kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-              output_data, output->dims()[1], indices_data, input_data,
-              input_width, input_width, static_cast<int>(k), gridx,
-              input_height));
+              output_data, k, indices_data, input_data, input_width,
+              input_width, static_cast<int>(k), gridx, input_height));
       default:
         PADDLE_THROW("Error");
     }
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index 054dd48199..76ece57b39 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -34,7 +34,6 @@ class TopkKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
-    // FIXME: only deal with matrix(2d tensor).
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     auto* indices = ctx.Output<Tensor>("Indices");
@@ -44,8 +43,6 @@ class TopkKernel : public framework::OpKernel<T> {
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
 
-    auto eg_input = EigenMatrix<T>::From(*input);
-
     // reshape input to a flattern matrix(like flat_inner_dims)
     framework::DDim inputdims = input->dims();
     const size_t row = framework::product(
@@ -53,7 +50,7 @@ class TopkKernel : public framework::OpKernel<T> {
     const size_t col = inputdims[inputdims.size() - 1];
     Eigen::DSizes<int, 2> flat2dims(row, col);
     // NOTE: eigen shape doesn't affect paddle tensor.
-    eg_input.reshape(flat2dims);
+    auto eg_input = EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
 
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index 6a9fc6611a..bbd71db606 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -210,18 +210,21 @@ REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
 REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
-    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
+    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TransposeKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     transpose_grad,
-    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, double>);
 
 REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
                   ops::Transpose2GradMaker);
 REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad);
 
 REGISTER_OP_CPU_KERNEL(
-    transpose2,
-    ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
+    transpose2, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TransposeKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     transpose2_grad,
-    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc
index c1b5a8b31b..b4025350fa 100644
--- a/paddle/fluid/operators/transpose_op.cu.cc
+++ b/paddle/fluid/operators/transpose_op.cu.cc
@@ -16,15 +16,18 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    transpose,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
+    transpose, ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     transpose_grad,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>);
 
 REGISTER_OP_CUDA_KERNEL(
     transpose2,
-    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     transpose2_grad,
-    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index aa907595cb..e3132ae76f 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -29,7 +29,7 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
     if (out_var->IsType<framework::LoDTensor>()) {
       tensor = out_var->GetMutable<framework::LoDTensor>();
     } else if (out_var->IsType<framework::SelectedRows>()) {
-      auto shape = ctx.Attr<std::vector<int>>("shape");
+      auto shape = ctx.Attr<std::vector<int64_t>>("shape");
       auto *selected_rows = out_var->GetMutable<framework::SelectedRows>();
       tensor = selected_rows->mutable_value();
       tensor->Resize(framework::make_ddim(shape));
@@ -67,7 +67,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(
         ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
         "uniform_random's min must less then max");
-    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
     std::vector<int64_t> temp;
     temp.reserve(shape.size());
     for (auto dim : shape) {
@@ -94,7 +94,7 @@ This operator initializes a tensor with random values sampled from a
 uniform distribution. The random result is in set [min, max].
 
 )DOC");
-    AddAttr<std::vector<int>>("shape", "The shape of the output tensor");
+    AddAttr<std::vector<int64_t>>("shape", "The shape of the output tensor");
     AddAttr<float>("min", "Minimum value of uniform random. [default -1.0].")
         .SetDefault(-1.0f);
     AddAttr<float>("max", "Maximun value of uniform random. [default 1.0].")
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index bbb692b0dd..2bb0ecc139 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -48,7 +48,7 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
     if (out_var->IsType<framework::LoDTensor>()) {
       tensor = out_var->GetMutable<framework::LoDTensor>();
     } else if (out_var->IsType<framework::SelectedRows>()) {
-      auto shape = context.Attr<std::vector<int>>("shape");
+      auto shape = context.Attr<std::vector<int64_t>>("shape");
       tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
       tensor->Resize(framework::make_ddim(shape));
     } else {
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index bb8b14bb9f..07bb02be19 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -76,8 +76,9 @@ enum class DataLayout {  // Not use
 
 enum class PoolingMode {
   kMaximum,
-  kAverage,
   kMaximumDeterministic,
+  kAverageExclusive,
+  kAverageInclusive,
 };
 
 #if CUDNN_VERSION < 6000
@@ -91,8 +92,10 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
   switch (mode) {
     case PoolingMode::kMaximumDeterministic:
       return CUDNN_POOLING_MAX;
-    case PoolingMode::kAverage:
+    case PoolingMode::kAverageExclusive:
       return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    case PoolingMode::kAverageInclusive:
+      return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
     case PoolingMode::kMaximum:
       return CUDNN_POOLING_MAX;
     default:
@@ -105,8 +108,10 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
   switch (mode) {
     case PoolingMode::kMaximumDeterministic:
       return CUDNN_POOLING_MAX_DETERMINISTIC;
-    case PoolingMode::kAverage:
+    case PoolingMode::kAverageExclusive:
       return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    case PoolingMode::kAverageInclusive:
+      return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
     case PoolingMode::kMaximum:
       return CUDNN_POOLING_MAX;
     default:
@@ -341,6 +346,28 @@ class ScopedPoolingDescriptor {
   DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
 };
 
+class ScopedSpatialTransformerDescriptor {
+ public:
+  ScopedSpatialTransformerDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnCreateSpatialTransformerDescriptor(&desc_));
+  }
+  ~ScopedSpatialTransformerDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnDestroySpatialTransformerDescriptor(desc_));
+  }
+
+  template <typename T>
+  inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims,
+                                                        const int dimA[]) {
+    PADDLE_ENFORCE(dynload::cudnnSetSpatialTransformerNdDescriptor(
+        desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType<T>::type, nbDims, dimA));
+    return desc_;
+  }
+
+ private:
+  cudnnSpatialTransformerDescriptor_t desc_;
+  DISABLE_COPY_AND_ASSIGN(ScopedSpatialTransformerDescriptor);
+};
+
 inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
   bool use_cudnn = ctx.Attr<bool>("use_cudnn");
   use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 7d1cf57253..924810bd61 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -32,23 +32,25 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
         "'Place' is not supported, Please re-compile with WITH_GPU "
         "option");
   }
-  return it->second.get();
+  return it->second.get().get();
 }
 
-const std::vector<const DeviceContext*>
-DeviceContextPool::GetAllDeviceContexts() const {
-  std::vector<const DeviceContext*> all_device_ctx;
-  all_device_ctx.reserve(device_contexts_.size());
-  for (auto& dev_ctx : device_contexts_) {
-    all_device_ctx.emplace_back(dev_ctx.second.get());
-  }
-  return all_device_ctx;
+template <typename DevCtx, typename PlaceType>
+inline void EmplaceDeviceContext(
+    std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
+        map_ptr,
+    platform::Place p) {
+  using PtrType = std::unique_ptr<DeviceContext>;
+  map_ptr->emplace(p, std::async(std::launch::deferred, [=] {
+                     // lazy evaluation. i.e., only create device context at
+                     // first `Get`
+                     return PtrType(new DevCtx(boost::get<PlaceType>(p)));
+                   }));
 }
 
 DeviceContextPool::DeviceContextPool(
     const std::vector<platform::Place>& places) {
   PADDLE_ENFORCE_GT(places.size(), 0);
-  using PtrType = std::unique_ptr<DeviceContext>;
   std::set<Place> set;
   for (auto& p : places) {
     set.insert(p);
@@ -57,16 +59,13 @@ DeviceContextPool::DeviceContextPool(
   for (auto& p : set) {
     if (platform::is_cpu_place(p)) {
 #ifdef PADDLE_WITH_MKLDNN
-      device_contexts_.emplace(
-          p, PtrType(new MKLDNNDeviceContext(boost::get<CPUPlace>(p))));
+      EmplaceDeviceContext<MKLDNNDeviceContext, CPUPlace>(&device_contexts_, p);
 #else
-      device_contexts_.emplace(
-          p, PtrType(new CPUDeviceContext(boost::get<CPUPlace>(p))));
+      EmplaceDeviceContext<CPUDeviceContext, CPUPlace>(&device_contexts_, p);
 #endif
     } else if (platform::is_gpu_place(p)) {
 #ifdef PADDLE_WITH_CUDA
-      device_contexts_.emplace(
-          p, PtrType(new CUDADeviceContext(boost::get<CUDAPlace>(p))));
+      EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
 #else
       PADDLE_THROW(
           "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
@@ -74,9 +73,8 @@ DeviceContextPool::DeviceContextPool(
 #endif
     } else if (platform::is_cuda_pinned_place(p)) {
 #ifdef PADDLE_WITH_CUDA
-      device_contexts_.emplace(
-          p,
-          PtrType(new CUDAPinnedDeviceContext(boost::get<CUDAPinnedPlace>(p))));
+      EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
+          &device_contexts_, p);
 #else
       PADDLE_THROW(
           "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
@@ -296,38 +294,73 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
 
 #ifdef PADDLE_WITH_MKLDNN
 MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
-    : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
-  p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
+    : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobmap_() {
+  p_blobmap_.reset(new BlobMap());
+  p_mutex_.reset(new std::mutex());
+}
+
+namespace {
+// Current thread's id.
+thread_local int cur_thread_id = 0;
 }
 
+void set_cur_thread_id(int tid) { cur_thread_id = tid; }
+int get_cur_thread_id(void) { return cur_thread_id; }
+
 void MKLDNNDeviceContext::SetBlob(const std::string& name,
                                   std::shared_ptr<void> data) const {
-  std::unordered_map<std::string, std::shared_ptr<void>>* p;
-  p = p_blobs_.get();
+  BlobMap* pMap = p_blobmap_.get();
+  std::shared_ptr<KeyBlob> pBlob = nullptr;
+
+  int tid = platform::get_cur_thread_id();
+
+  std::lock_guard<std::mutex> lock(*p_mutex_.get());
 
-  auto it = p->find(name);
+  // Find KeyBlob for current thread
+  auto map_it = pMap->find(tid);
 
-  if (it == p->end()) {
-    (*p)[name] = data;  // create new blob
+  if (map_it == pMap->end()) {
+    // 1st time to set blob in current thread
+    pBlob = std::shared_ptr<KeyBlob>(new KeyBlob());
+    (*pMap)[tid] = pBlob;
   } else {
-    it->second = data;  // set data to existing blob
+    pBlob = map_it->second;
   }
 
+  // Find Key in found (or newly created) KeyBlob
+  auto key_it = pBlob->find(name);
+
+  if (key_it == pBlob->end()) {
+    (*pBlob)[name] = data;  // create new blob
+  } else {
+    key_it->second = data;  // set data to existing blob
+  }
+
+  // lock will be automatically released when out of scope
   return;
 }
 
 std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
     const std::string& name) const {
-  std::unordered_map<std::string, std::shared_ptr<void>>* p;
-  p = p_blobs_.get();
+  BlobMap* pMap = p_blobmap_.get();
+  std::shared_ptr<KeyBlob> pBlob = nullptr;
 
-  auto it = p->find(name);
+  int tid = platform::get_cur_thread_id();
 
-  if (it != p->end()) {
-    return it->second;
-  }
+  std::lock_guard<std::mutex> lock(*p_mutex_.get());
+
+  // Find KeyBlob for current thread firstly
+  auto map_it = pMap->find(tid);
+  if (map_it == pMap->end()) return nullptr;
+  pBlob = map_it->second;
+
+  // Find Blob via name
+  auto key_it = pBlob->find(name);
+
+  if (key_it == pBlob->end()) return nullptr;
 
-  return nullptr;
+  // lock will be automatically released when out of scope
+  return key_it->second;
 }
 
 #endif
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 999bbe00f1..0240b9380f 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#include <future>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
@@ -176,6 +177,12 @@ struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
+using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
+using BlobMap = std::unordered_map<int, std::shared_ptr<KeyBlob>>;
+
+void set_cur_thread_id(int);
+int get_cur_thread_id(void);
+
 class MKLDNNDeviceContext : public CPUDeviceContext {
  public:
   explicit MKLDNNDeviceContext(CPUPlace place);
@@ -191,8 +198,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
 
  private:
   mkldnn::engine engine_;
-  std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
-      p_blobs_;
+  std::shared_ptr<BlobMap> p_blobmap_;
+  std::shared_ptr<std::mutex> p_mutex_;
 };
 #endif
 
@@ -217,9 +224,6 @@ class DeviceContextPool {
   /*! \brief  Return handle of single device context. */
   platform::DeviceContext* Get(const platform::Place& place);
 
-  /*! \brief  Return all the device contexts. */
-  const std::vector<const DeviceContext*> GetAllDeviceContexts() const;
-
   template <typename Place>
   const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
       const Place& place) {
@@ -231,7 +235,8 @@ class DeviceContextPool {
 
  private:
   static DeviceContextPool* pool;
-  std::map<Place, std::unique_ptr<DeviceContext>> device_contexts_;
+  std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
+      device_contexts_;
   DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
 };
 
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index e6353f67ef..d3d754b6f5 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -65,44 +65,51 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
  * include all needed cudnn functions in HPPL
  * different cudnn version has different interfaces
  **/
-#define CUDNN_DNN_ROUTINE_EACH(__macro)             \
-  __macro(cudnnSetTensor4dDescriptor);              \
-  __macro(cudnnSetTensor4dDescriptorEx);            \
-  __macro(cudnnSetTensorNdDescriptor);              \
-  __macro(cudnnGetTensorNdDescriptor);              \
-  __macro(cudnnGetConvolutionNdForwardOutputDim);   \
-  __macro(cudnnGetConvolutionForwardAlgorithm);     \
-  __macro(cudnnCreateTensorDescriptor);             \
-  __macro(cudnnDestroyTensorDescriptor);            \
-  __macro(cudnnCreateFilterDescriptor);             \
-  __macro(cudnnSetFilter4dDescriptor);              \
-  __macro(cudnnSetFilterNdDescriptor);              \
-  __macro(cudnnGetFilterNdDescriptor);              \
-  __macro(cudnnSetPooling2dDescriptor);             \
-  __macro(cudnnSetPoolingNdDescriptor);             \
-  __macro(cudnnGetPoolingNdDescriptor);             \
-  __macro(cudnnDestroyFilterDescriptor);            \
-  __macro(cudnnCreateConvolutionDescriptor);        \
-  __macro(cudnnCreatePoolingDescriptor);            \
-  __macro(cudnnDestroyPoolingDescriptor);           \
-  __macro(cudnnSetConvolution2dDescriptor);         \
-  __macro(cudnnDestroyConvolutionDescriptor);       \
-  __macro(cudnnSetConvolutionNdDescriptor);         \
-  __macro(cudnnGetConvolutionNdDescriptor);         \
-  __macro(cudnnDeriveBNTensorDescriptor);           \
-  __macro(cudnnCreate);                             \
-  __macro(cudnnDestroy);                            \
-  __macro(cudnnSetStream);                          \
-  __macro(cudnnActivationForward);                  \
-  __macro(cudnnConvolutionForward);                 \
-  __macro(cudnnConvolutionBackwardBias);            \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize); \
-  __macro(cudnnTransformTensor);                    \
-  __macro(cudnnPoolingForward);                     \
-  __macro(cudnnPoolingBackward);                    \
-  __macro(cudnnSoftmaxBackward);                    \
-  __macro(cudnnSoftmaxForward);                     \
-  __macro(cudnnGetVersion);                         \
+#define CUDNN_DNN_ROUTINE_EACH(__macro)              \
+  __macro(cudnnSetTensor4dDescriptor);               \
+  __macro(cudnnSetTensor4dDescriptorEx);             \
+  __macro(cudnnSetTensorNdDescriptor);               \
+  __macro(cudnnGetTensorNdDescriptor);               \
+  __macro(cudnnGetConvolutionNdForwardOutputDim);    \
+  __macro(cudnnGetConvolutionForwardAlgorithm);      \
+  __macro(cudnnCreateTensorDescriptor);              \
+  __macro(cudnnDestroyTensorDescriptor);             \
+  __macro(cudnnCreateFilterDescriptor);              \
+  __macro(cudnnSetFilter4dDescriptor);               \
+  __macro(cudnnSetFilterNdDescriptor);               \
+  __macro(cudnnGetFilterNdDescriptor);               \
+  __macro(cudnnSetPooling2dDescriptor);              \
+  __macro(cudnnSetPoolingNdDescriptor);              \
+  __macro(cudnnGetPoolingNdDescriptor);              \
+  __macro(cudnnDestroyFilterDescriptor);             \
+  __macro(cudnnCreateConvolutionDescriptor);         \
+  __macro(cudnnCreatePoolingDescriptor);             \
+  __macro(cudnnDestroyPoolingDescriptor);            \
+  __macro(cudnnSetConvolution2dDescriptor);          \
+  __macro(cudnnDestroyConvolutionDescriptor);        \
+  __macro(cudnnSetConvolutionNdDescriptor);          \
+  __macro(cudnnGetConvolutionNdDescriptor);          \
+  __macro(cudnnDeriveBNTensorDescriptor);            \
+  __macro(cudnnCreateSpatialTransformerDescriptor);  \
+  __macro(cudnnSetSpatialTransformerNdDescriptor);   \
+  __macro(cudnnDestroySpatialTransformerDescriptor); \
+  __macro(cudnnSpatialTfGridGeneratorForward);       \
+  __macro(cudnnSpatialTfGridGeneratorBackward);      \
+  __macro(cudnnSpatialTfSamplerForward);             \
+  __macro(cudnnSpatialTfSamplerBackward);            \
+  __macro(cudnnCreate);                              \
+  __macro(cudnnDestroy);                             \
+  __macro(cudnnSetStream);                           \
+  __macro(cudnnActivationForward);                   \
+  __macro(cudnnConvolutionForward);                  \
+  __macro(cudnnConvolutionBackwardBias);             \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize);  \
+  __macro(cudnnTransformTensor);                     \
+  __macro(cudnnPoolingForward);                      \
+  __macro(cudnnPoolingBackward);                     \
+  __macro(cudnnSoftmaxBackward);                     \
+  __macro(cudnnSoftmaxForward);                      \
+  __macro(cudnnGetVersion);                          \
   __macro(cudnnGetErrorString);
 CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 3b22718a8c..d3b0d4a229 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -57,6 +57,18 @@ struct variant_caster<V<Ts...>> {
     auto caster = make_caster<T>();
     if (!load_success_ && caster.load(src, convert)) {
       load_success_ = true;
+
+      if (std::is_same<T, std::vector<float>>::value) {
+        auto caster_ints = make_caster<std::vector<int64_t>>();
+        if (caster_ints.load(src, convert)) {
+          VLOG(4) << "This value are floats and int64_ts satisfy "
+                     "simultaneously, will set it's type to "
+                     "std::vector<int64_t>";
+          value = cast_op<std::vector<int64_t>>(caster_ints);
+          return true;
+        }
+      }
+
       value = cast_op<T>(caster);
       return true;
     }
@@ -259,6 +271,8 @@ void BindOpDesc(pybind11::module *m) {
   pybind11::enum_<pd::proto::AttrType>(*m, "AttrType", "")
       .value("INT", pd::proto::AttrType::INT)
       .value("INTS", pd::proto::AttrType::INTS)
+      .value("LONG", pd::proto::AttrType::LONG)
+      .value("LONGS", pd::proto::AttrType::LONGS)
       .value("FLOAT", pd::proto::AttrType::FLOAT)
       .value("FLOATS", pd::proto::AttrType::FLOATS)
       .value("STRING", pd::proto::AttrType::STRING)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 339a7c98c6..7c7b14df66 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -645,9 +645,13 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
   pass.def(py::init())
-      .def("set_str", [](ir::Pass &self, const std::string &name,
-                         const std::string &attr) {
-        self.Set<std::string>(name, new std::string(attr));
+      .def(
+          "set_str",
+          [](ir::Pass &self, const std::string &name, const std::string &attr) {
+            self.Set<std::string>(name, new std::string(attr));
+          })
+      .def("set_int", [](ir::Pass &self, const std::string &name, int val) {
+        self.Set<const int>(name, new int(val));
       });
 
   py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
@@ -817,6 +821,13 @@ All parameter, weight, gradient are variables in Paddle.
           [](BuildStrategy &self, bool b) {
             self.enable_data_balance_ = b;
           })  // FIXME(chengudo): enable_data_balance seems not important
+      .def_property("enable_sequential_execution",
+                    [](const BuildStrategy &self) {
+                      return self.enable_sequential_execution_;
+                    },
+                    [](BuildStrategy &self, bool b) {
+                      self.enable_sequential_execution_ = b;
+                    })
       .def_property(
           "fuse_elewise_add_act_ops",
           [](const BuildStrategy &self) {
diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt
index 78d6e5ff55..eabb51d370 100644
--- a/paddle/fluid/train/demo/CMakeLists.txt
+++ b/paddle/fluid/train/demo/CMakeLists.txt
@@ -15,6 +15,7 @@ include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
 include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
 include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
 include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
 include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
@@ -27,6 +28,7 @@ link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
 link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
 link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
 link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
 
 add_executable(demo_trainer demo_trainer.cc)
@@ -62,5 +64,5 @@ target_link_libraries(demo_trainer
         ${ARCHIVE_END}
         ${MATH_LIB}
         ${MKLDNN_LIB}
-        glog gflags protobuf snappystream snappy z
+        glog gflags protobuf snappystream snappy z xxhash
         ${EXTERNAL_LIB})
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 85493c1054..d7676f89ab 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -95,9 +95,9 @@ function cmake_gen() {
                 exit 1
             fi
         fi
-    else 
+    else
         if [ "$1" != "" ]; then
-            echo "using python abi: $1"     
+            echo "using python abi: $1"
             if [ "$1" == "cp27-cp27m" ]; then
                 export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
                 export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
@@ -119,7 +119,7 @@ function cmake_gen() {
            fi
         fi
     fi
-    
+
     if [ "$SYSTEM" == "Darwin" ]; then
         WITH_DISTRIBUTE=${WITH_DISTRIBUTE:-ON}
         WITH_AVX=${WITH_AVX:-ON}
@@ -127,7 +127,7 @@ function cmake_gen() {
     else
         INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo}
     fi
-    
+
     cat <<EOF
     ========================================
     Configuring cmake in /paddle/build ...
@@ -147,13 +147,11 @@ function cmake_gen() {
         -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
         -DCUDNN_ROOT=/usr/
         -DWITH_TESTING=${WITH_TESTING:-ON}
-        -DWITH_FAST_BUNDLE_TEST=ON
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-        -DWITH_INFERENCE=${WITH_INFERENCE:-ON}
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
         -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
@@ -181,12 +179,10 @@ EOF
         -DWITH_PYTHON=${WITH_PYTHON:-ON} \
         -DCUDNN_ROOT=/usr/ \
         -DWITH_TESTING=${WITH_TESTING:-ON} \
-        -DWITH_FAST_BUNDLE_TEST=ON \
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-        -DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
         -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
@@ -394,8 +390,8 @@ EOF
         export http_proxy=
         export https_proxy=
         # TODO: jiabin need to refine this part when these tests fixed on mac
-        ctest --output-on-failure -j $1     
-        # make install should also be test when unittest 
+        ctest --output-on-failure -j $1
+        # make install should also be test when unittest
         make install -j 8
         pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
@@ -653,20 +649,20 @@ function gen_capi_package() {
 function gen_fluid_lib() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
-    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
         cat <<EOF
     ========================================
     Generating fluid library for train and inference ...
     ========================================
 EOF
-        cmake .. -DWITH_DISTRIBUTE=OFF
+        cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON
         make -j `nproc` fluid_lib_dist
         make -j `nproc` inference_lib_dist
       fi
 }
 
 function tar_fluid_lib() {
-    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
         cat <<EOF
     ========================================
     Taring fluid library for train and inference ...
@@ -681,7 +677,7 @@ EOF
 }
 
 function test_fluid_lib() {
-    if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then
+    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
         cat <<EOF
     ========================================
     Testing fluid library for inference ...
diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py
index 9c02e0f41b..4a0c1f8cb6 100644
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -78,7 +78,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
                     six.iteritems(word_dict), key=lambda x: x[1],
                     reverse=True)):
             if idx + 3 == dict_size: break
-            fout.write("%s\n" % (word[0]))
+            fout.write("%s\n" % (cpt.to_bytes(word[0])))
 
 
 def __load_dict(tar_file, dict_size, lang, reverse=False):
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index bcd4e4f607..737c8be814 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -121,6 +121,9 @@ def __bootstrap__():
         read_env_flags.append('rpc_server_profile_period')
         read_env_flags.append('rpc_server_profile_path')
         read_env_flags.append('enable_rpc_profiler')
+        read_env_flags.append('rpc_send_thread_num')
+        read_env_flags.append('rpc_get_thread_num')
+        read_env_flags.append('rpc_prefetch_thread_num')
 
     if core.is_compiled_with_cuda():
         read_env_flags += [
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 4c24d0d6a7..1738afe93e 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -272,7 +272,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
                 )
 
         square = grad * grad
-        local_norm_var = layers.cast(layers.reduce_sum(input=square), 'float64')
+        local_norm_var = layers.reduce_sum(input=square)
         context[self.group_name].append(local_norm_var)
 
         self.context = context
@@ -282,7 +282,6 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
         if group_scale_name not in self.context:
             group_norm_var = layers.sums(input=self.context[self.group_name])
             group_norm_var = layers.sqrt(x=group_norm_var)
-            group_norm_var = layers.cast(group_norm_var, 'float32')
             clip_var = self.context[self.group_name + "_clip"]
             group_scale_var = layers.elementwise_div(
                 x=clip_var,
@@ -333,7 +332,8 @@ def append_gradient_clip_ops(param_grads):
     for p, g in param_grads:
         if g is None:
             continue
-        with p.block.program._optimized_guard([p, g]):
+        with p.block.program._optimized_guard(
+            [p, g]), framework.name_scope('append_clip'):
             clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
             if clip_attr is None:
                 clip_attr = NullGradientClipAttr()
@@ -348,7 +348,8 @@ def append_gradient_clip_ops(param_grads):
     for p, g in param_grads:
         if g is None:
             continue
-        with p.block.program._optimized_guard([p, g]):
+        with p.block.program._optimized_guard(
+            [p, g]), framework.name_scope('append_graident_clip'):
             res.append(clip_attr._create_operators(param=p, grad=g))
 
     return res
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
index 7a82038ff7..c84dd4bc47 100644
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -316,7 +316,7 @@ class DetectionMAP(Evaluator):
         gt_label (Variable): The ground truth label index, which is a LoDTensor
             with shape [N, 1].
         gt_box (Variable): The ground truth bounding box (bbox), which is a
-            LoDTensor with shape [N, 6]. The layout is [xmin, ymin, xmax, ymax].
+            LoDTensor with shape [N, 4]. The layout is [xmin, ymin, xmax, ymax].
         gt_difficult (Variable|None): Whether this ground truth is a difficult
             bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
             it means all the ground truth labels are not difficult bbox.
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index b07d0131a3..fd03dff386 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1496,6 +1496,9 @@ class Program(object):
             >>> with program._optimized_guard([p,g]):
             >>>     p = p - 0.001 * g
         """
+        tmp_role = self._current_role
+        tmp_var = self._op_role_var
+
         OpRole = core.op_proto_and_checker_maker.OpRole
         self._current_role = OpRole.Optimize
         self._op_role_var = [
@@ -1503,11 +1506,11 @@ class Program(object):
             for var in param_and_grads
         ]
         yield
-        self._op_role_var = []
-        self._current_role = OpRole.Forward
+        self._op_role_var = tmp_var
+        self._current_role = tmp_role
 
     @contextlib.contextmanager
-    def _lr_schedule_guard(self):
+    def _lr_schedule_guard(self, is_with_opt=False):
         """
         A with guard to set :code:`LRSched` :code:`OpRole` and
         :code:`OpRoleVar` automatically. The :code:`OpRoleVar` is
@@ -1515,6 +1518,10 @@ class Program(object):
 
         Notes: This is a very low level API. Users should not use it directly.
 
+        Args:
+            is_with_opt: Only set to true if these ops a in the middle
+                 of a bunch of optimize ops so that it can be treated
+                 correctly. For example, sgd->lr_op->sgd->lr_op->sgd.
 
         Examples:
 
@@ -1528,6 +1535,8 @@ class Program(object):
 
         OpRole = core.op_proto_and_checker_maker.OpRole
         self._current_role = OpRole.LRSched
+        if is_with_opt:
+            self._current_role = int(OpRole.LRSched) | int(OpRole.Optimize)
         # TODO(typhoonzero): how to set target learning rate var
         self._op_role_var = []
         yield
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 604f3eacd7..22c60c1cbe 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -884,12 +884,13 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
 
     load_prog = Program()
     load_block = load_prog.global_block()
+    need_delete_vars = []
 
     for var_tuple in slice_vars_and_attrs:
         orig_var = var_tuple[0]
         start = var_tuple[1]
         slice_var = var_tuple[2]
-        end = start + reduce(lambda x, y: x * y, slice_var.shape)
+        end = start + slice_var.shape[0]
 
         clone_orig_var = load_block.create_var(
             name=orig_var.name,
@@ -917,5 +918,8 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
             attrs={'axes': [0],
                    'starts': [start],
                    'ends': [end]})
-
+        need_delete_vars.append(clone_orig_var)
+    load_block.append_op(
+        type='delete_var',
+        inputs={'X': need_delete_vars}, )
     executor.run(load_prog)
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index bd9727b6ac..dc317de9ab 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -324,10 +324,19 @@ class LayerHelper(object):
             raise ValueError("no Parameter name %s found" % name)
         return param
 
-    def create_tmp_variable(self, dtype, stop_gradient=False):
+    def create_variable_for_type_inference(self, dtype, stop_gradient=False):
+        """Create a temporary variable that should be type inferred layer.
+
+        Note:
+            The default type will be set to LOD_TENSOR. However, when
+            the var is used as operator output, its type will be updated
+            based on operator's `VarTypeInference` implementation in
+            infer_var_type.
+        """
         return self.main_program.current_block().create_var(
             name=unique_name.generate(".".join([self.name, 'tmp'])),
             dtype=dtype,
+            type=core.VarDesc.VarType.LOD_TENSOR,
             persistable=False,
             stop_gradient=stop_gradient)
 
@@ -388,7 +397,7 @@ class LayerHelper(object):
 
         b = self.create_parameter(
             attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
-        tmp = self.create_tmp_variable(dtype=input_var.dtype)
+        tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
         self.append_op(
             type='elementwise_add',
             inputs={'X': [input_var],
@@ -414,7 +423,7 @@ class LayerHelper(object):
         tmp = input_var
         # NOTE(dzhwinter): some activation support inplace compution.
         if not core.IsInplace(act_type):
-            tmp = self.create_tmp_variable(dtype=input_var.dtype)
+            tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
         self.append_op(
             type=act_type,
             inputs={"X": [input_var]},
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 4af97e8632..9730fbf510 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -80,8 +80,8 @@ def split_lod_tensor(input, mask, level=0):
 
     """
     helper = LayerHelper('split_lod_tensor', **locals())
-    out_true = helper.create_tmp_variable(dtype=input.dtype)
-    out_false = helper.create_tmp_variable(dtype=input.dtype)
+    out_true = helper.create_variable_for_type_inference(dtype=input.dtype)
+    out_false = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type='split_lod_tensor',
         inputs={
@@ -131,7 +131,7 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0):
                 in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
     """
     helper = LayerHelper('merge_lod_tensor', **locals())
-    out = helper.create_tmp_variable(dtype=in_true.dtype)
+    out = helper.create_variable_for_type_inference(dtype=in_true.dtype)
     helper.append_op(
         type='merge_lod_tensor',
         inputs={'X': x,
@@ -524,7 +524,7 @@ class StaticRNN(object):
         if not isinstance(o, Variable):
             raise TypeError("step output takes a Variable")
 
-        tmp_o = self.helper.create_tmp_variable(dtype=o.dtype)
+        tmp_o = self.helper.create_variable_for_type_inference(dtype=o.dtype)
         self.helper.append_op(
             type='rnn_memory_helper',
             inputs={'X': [o]},
@@ -606,7 +606,8 @@ class StaticRNN(object):
             pre_memories.append(mem.pre_mem.name)
             mem_var = rnn_block.var(mem.mem.name)
             assert isinstance(mem_var, Variable)
-            new_mem = self.helper.create_tmp_variable(dtype=mem_var.dtype)
+            new_mem = self.helper.create_variable_for_type_inference(
+                dtype=mem_var.dtype)
 
             rnn_block.append_op(
                 type='rnn_memory_helper',
@@ -813,7 +814,7 @@ def max_sequence_len(rank_table):
         ${out_comment}.
     """
     helper = LayerHelper("max_seqence_len", **locals())
-    res = helper.create_tmp_variable(dtype="int64")
+    res = helper.create_variable_for_type_inference(dtype="int64")
     helper.append_op(
         type="max_sequence_len",
         inputs={"RankTable": rank_table},
@@ -884,7 +885,7 @@ def array_to_lod_tensor(x, table):
           lod_tensor = fluid.layers.array_to_lod_tensor(array, table)
     """
     helper = LayerHelper("array_to_lod_tensor", **locals())
-    tmp = helper.create_tmp_variable(dtype=x.dtype)
+    tmp = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type="array_to_lod_tensor",
         inputs={'X': x,
@@ -915,7 +916,7 @@ def increment(x, value=1.0, in_place=True):
     """
     helper = LayerHelper("increment", **locals())
     if not in_place:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = x
     helper.append_op(
@@ -1012,7 +1013,7 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored):
     """
     helper = LayerHelper("less_than", **locals())
     if cond is None:
-        cond = helper.create_tmp_variable(dtype='bool')
+        cond = helper.create_variable_for_type_inference(dtype='bool')
         cond.stop_gradient = True
 
     attrs = dict()
@@ -1051,7 +1052,7 @@ def equal(x, y, cond=None, **ignored):
     """
     helper = LayerHelper("equal", **locals())
     if cond is None:
-        cond = helper.create_tmp_variable(dtype='bool')
+        cond = helper.create_variable_for_type_inference(dtype='bool')
         cond.stop_gradient = True
 
     helper.append_op(
@@ -1098,7 +1099,7 @@ def array_read(array, i):
             array,
             Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
         raise TypeError("array should be tensor array vairable")
-    out = helper.create_tmp_variable(dtype=array.dtype)
+    out = helper.create_variable_for_type_inference(dtype=array.dtype)
     helper.append_op(
         type='read_from_array',
         inputs={'X': [array],
@@ -1133,7 +1134,7 @@ def shrink_memory(x, i, table):
         usage.
     """
     helper = LayerHelper('shrink_memory', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='shrink_rnn_memory',
         inputs={'X': [x],
@@ -1170,7 +1171,7 @@ def array_length(array):
 
     """
     helper = LayerHelper('array_length', **locals())
-    tmp = helper.create_tmp_variable(dtype='int64')
+    tmp = helper.create_variable_for_type_inference(dtype='int64')
     tmp.stop_gradient = True
     helper.append_op(
         type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
@@ -1585,12 +1586,11 @@ class DynamicRNN(object):
         self.lod_rank_table = None
         self.max_seq_len = None
         self.step_idx = None
-        self.zero_idx = fill_constant(
-            shape=[1], value=0, dtype='int64', force_cpu=True)
+        self.zero_idx = None
         self.mem_dict = dict()
         self.output_array = []
         self.outputs = []
-        self.cond = self.helper.create_tmp_variable(dtype='bool')
+        self.cond = self.helper.create_variable_for_type_inference(dtype='bool')
         self.cond.stop_gradient = False
         self.while_op = While(self.cond)
         self.input_array = []
@@ -1791,6 +1791,7 @@ class DynamicRNN(object):
 
         """
         self._assert_in_rnn_block_('memory')
+        self._init_zero_idx_()
         if init is not None:
             if not isinstance(init, Variable):
                 raise TypeError(
@@ -1904,6 +1905,22 @@ class DynamicRNN(object):
             array_write(x=each, i=self.step_idx, array=outside_array)
             self.output_array.append(outside_array)
 
+    def _init_zero_idx_(self):
+        if self.zero_idx is None:
+            parent_block = self._parent_block_()
+            self.zero_idx = parent_block.create_var(
+                name=unique_name.generate('zero_idx'), dtype='int64')
+            parent_block.append_op(
+                type='fill_constant',
+                inputs={},
+                outputs={'Out': [self.zero_idx]},
+                attrs={
+                    'shape': [1],
+                    'dtype': self.zero_idx.dtype,
+                    'value': float(0),
+                    'force_cpu': True
+                })
+
     def _parent_block_(self):
         prog = self.helper.main_program
         parent_idx = prog.current_block().parent_idx
@@ -1924,7 +1941,7 @@ def reorder_lod_tensor_by_rank(x, rank_table):
     helper.is_instance('x', Variable)
     helper.is_instance('rank_table', Variable)
 
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='reorder_lod_tensor_by_rank',
         inputs={'X': [x],
@@ -1958,7 +1975,7 @@ def is_empty(x, cond=None, **ignored):
     """
     helper = LayerHelper("is_empty", **locals())
     if cond is None:
-        cond = helper.create_tmp_variable(dtype='bool')
+        cond = helper.create_variable_for_type_inference(dtype='bool')
         cond.stop_gradient = True
     elif not isinstance(cond, Variable):
         raise TypeError("cond takes a variable")
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 1cfcbbb9c1..4ac94981a7 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -116,8 +116,8 @@ def rpn_target_assign(bbox_pred,
     Returns:
         tuple:
                A tuple(predicted_scores, predicted_location, target_label,
-               target_bbox) is returned. The predicted_scores and
-               predicted_location is the predicted result of the RPN.
+               target_bbox, bbox_inside_weight) is returned. The predicted_scores 
+               and predicted_location is the predicted result of the RPN.
                The target_label and target_bbox is the ground truth,
                respectively. The predicted_location is a 2D Tensor with shape
                [F, 4], and the shape of target_bbox is same as the shape of
@@ -126,6 +126,8 @@ def rpn_target_assign(bbox_pred,
                [F + B, 1], and the shape of target_label is same as the shape
                of the predicted_scores, B is the number of the background
                anchors, the F and B is depends on the input of this operator.
+               Bbox_inside_weight represents whether the predicted loc is fake_fg
+               or not and the shape is [F, 4].
 
     Examples:
         .. code-block:: python
@@ -138,7 +140,7 @@ def rpn_target_assign(bbox_pred,
                           append_batch_size=False, dtype='float32')
         gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
                          append_batch_size=False, dtype='float32')
-        loc_pred, score_pred, loc_target, score_target =
+        loc_pred, score_pred, loc_target, score_target, bbox_inside_weight =
             fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
                                           cls_logits=cls_logits,
                                           anchor_box=anchor_box,
@@ -147,10 +149,13 @@ def rpn_target_assign(bbox_pred,
 
     helper = LayerHelper('rpn_target_assign', **locals())
     # Assign target label to anchors
-    loc_index = helper.create_tmp_variable(dtype='int32')
-    score_index = helper.create_tmp_variable(dtype='int32')
-    target_label = helper.create_tmp_variable(dtype='int32')
-    target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype)
+    loc_index = helper.create_variable_for_type_inference(dtype='int32')
+    score_index = helper.create_variable_for_type_inference(dtype='int32')
+    target_label = helper.create_variable_for_type_inference(dtype='int32')
+    target_bbox = helper.create_variable_for_type_inference(
+        dtype=anchor_box.dtype)
+    bbox_inside_weight = helper.create_variable_for_type_inference(
+        dtype=anchor_box.dtype)
     helper.append_op(
         type="rpn_target_assign",
         inputs={
@@ -163,7 +168,8 @@ def rpn_target_assign(bbox_pred,
             'LocationIndex': loc_index,
             'ScoreIndex': score_index,
             'TargetLabel': target_label,
-            'TargetBBox': target_bbox
+            'TargetBBox': target_bbox,
+            'BBoxInsideWeight': bbox_inside_weight
         },
         attrs={
             'rpn_batch_size_per_im': rpn_batch_size_per_im,
@@ -178,13 +184,14 @@ def rpn_target_assign(bbox_pred,
     score_index.stop_gradient = True
     target_label.stop_gradient = True
     target_bbox.stop_gradient = True
+    bbox_inside_weight.stop_gradient = True
 
     cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1))
     bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4))
     predicted_cls_logits = nn.gather(cls_logits, score_index)
     predicted_bbox_pred = nn.gather(bbox_pred, loc_index)
 
-    return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox
+    return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox, bbox_inside_weight
 
 
 def detection_output(loc,
@@ -282,7 +289,8 @@ def detection_output(loc,
     scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
     scores = nn.transpose(scores, perm=[0, 2, 1])
     scores.stop_gradient = True
-    nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
+    nmsed_outs = helper.create_variable_for_type_inference(
+        dtype=decoded_box.dtype)
     helper.append_op(
         type="multiclass_nms",
         inputs={'Scores': scores,
@@ -314,7 +322,7 @@ def iou_similarity(x, y, name=None):
     """
     helper = LayerHelper("iou_similarity", **locals())
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -351,7 +359,8 @@ def box_coder(prior_box,
     helper = LayerHelper("box_coder", **locals())
 
     if name is None:
-        output_box = helper.create_tmp_variable(dtype=prior_box.dtype)
+        output_box = helper.create_variable_for_type_inference(
+            dtype=prior_box.dtype)
     else:
         output_box = helper.create_variable(
             name=name, dtype=prior_box.dtype, persistable=False)
@@ -382,7 +391,7 @@ def polygon_box_transform(input, name=None):
     """
     helper = LayerHelper("polygon_box_transform", **locals())
     if name is None:
-        output = helper.create_tmp_variable(dtype=input.dtype)
+        output = helper.create_variable_for_type_inference(dtype=input.dtype)
     else:
         output = helper.create_variable(
             name=name, dtype=prior_box.input, persistable=False)
@@ -450,7 +459,7 @@ def detection_map(detect_res,
     helper = LayerHelper("detection_map", **locals())
 
     def __create_var(type):
-        return helper.create_tmp_variable(dtype=type)
+        return helper.create_variable_for_type_inference(dtype=type)
 
     map_out = __create_var('float32')
     accum_pos_count_out = out_states[0] if out_states else __create_var('int32')
@@ -557,8 +566,9 @@ def bipartite_match(dist_matrix,
         >>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
     """
     helper = LayerHelper('bipartite_match', **locals())
-    match_indices = helper.create_tmp_variable(dtype='int32')
-    match_distance = helper.create_tmp_variable(dtype=dist_matrix.dtype)
+    match_indices = helper.create_variable_for_type_inference(dtype='int32')
+    match_distance = helper.create_variable_for_type_inference(
+        dtype=dist_matrix.dtype)
     helper.append_op(
         type='bipartite_match',
         inputs={'DistMat': dist_matrix},
@@ -644,8 +654,8 @@ def target_assign(input,
                             gt, matched_indices, mismatch_value=0)
     """
     helper = LayerHelper('target_assign', **locals())
-    out = helper.create_tmp_variable(dtype=input.dtype)
-    out_weight = helper.create_tmp_variable(dtype='float32')
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    out_weight = helper.create_variable_for_type_inference(dtype='float32')
     helper.append_op(
         type='target_assign',
         inputs={
@@ -816,9 +826,10 @@ def ssd_loss(location,
     conf_loss = nn.reshape(
         x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape)
     conf_loss.stop_gradient = True
-    neg_indices = helper.create_tmp_variable(dtype='int32')
+    neg_indices = helper.create_variable_for_type_inference(dtype='int32')
     dtype = matched_indices.dtype
-    updated_matched_indices = helper.create_tmp_variable(dtype=dtype)
+    updated_matched_indices = helper.create_variable_for_type_inference(
+        dtype=dtype)
     helper.append_op(
         type='mine_hard_examples',
         inputs={
@@ -998,8 +1009,8 @@ def prior_box(input,
             max_sizes = [max_sizes]
         attrs['max_sizes'] = max_sizes
 
-    box = helper.create_tmp_variable(dtype)
-    var = helper.create_tmp_variable(dtype)
+    box = helper.create_variable_for_type_inference(dtype)
+    var = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="prior_box",
         inputs={"Input": input,
@@ -1337,8 +1348,8 @@ def anchor_generator(input,
         'offset': offset
     }
 
-    anchor = helper.create_tmp_variable(dtype)
-    var = helper.create_tmp_variable(dtype)
+    anchor = helper.create_variable_for_type_inference(dtype)
+    var = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="anchor_generator",
         inputs={"Input": input},
@@ -1384,7 +1395,7 @@ def roi_perspective_transform(input,
     """
     helper = LayerHelper('roi_perspective_transform', **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="roi_perspective_transform",
         inputs={"X": input,
@@ -1413,16 +1424,49 @@ def generate_proposal_labels(rpn_rois,
                              use_random=True):
     """
     ** Generate proposal labels Faster-RCNN **
-    TODO(buxingyuan): Add Document
+    This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth,
+    to sample foreground boxes and background boxes, and compute loss target.
+
+    RpnRois is the output boxes of RPN and was processed by generate_proposal_op, these boxes
+    were combined with groundtruth boxes and sampled according to batch_size_per_im and fg_fraction,
+    If an instance with a groundtruth overlap greater than fg_thresh, then it was considered as a foreground sample.
+    If an instance with a groundtruth overlap greater than bg_thresh_lo and lower than bg_thresh_hi,
+    then it was considered as a background sample.
+    After all foreground and background boxes are chosen (so called Rois),
+    then we apply random sampling to make sure
+    the number of foreground boxes is no more than batch_size_per_im * fg_fraction.
+
+    For each box in Rois, we assign the classification (class label) and regression targets (box label) to it.
+    Finally BboxInsideWeights and BboxOutsideWeights are used to specify whether it would contribute to training loss.
+
+    Args:
+        rpn_rois(Variable): A 2-D LoDTensor with shape [N, 4]. N is the number of the GenerateProposalOp's output, each element is a bounding box with [xmin, ymin, xmax, ymax] format.
+        gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a class label of groundtruth.
+        is_crowd(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a flag indicates whether a groundtruth is crowd.
+        gt_boxes(Variable): A 2-D LoDTensor with shape [M, 4]. M is the number of groundtruth, each element is a bounding box with [xmin, ymin, xmax, ymax] format.
+        im_info(Variable): A 2-D LoDTensor with shape [B, 3]. B is the number of input images, each element consists of im_height, im_width, im_scale.
+
+        batch_size_per_im(int): Batch size of rois per images.
+        fg_fraction(float): Foreground fraction in total batch_size_per_im.
+        fg_thresh(float): Overlap threshold which is used to chose foreground sample.
+        bg_thresh_hi(float): Overlap threshold upper bound which is used to chose background sample.
+        bg_thresh_lo(float): Overlap threshold lower bound which is used to chose background sample.
+        bbox_reg_weights(list|tuple): Box regression weights.
+        class_nums(int): Class number.
+        use_random(bool): Use random sampling to choose foreground and background boxes.
     """
 
     helper = LayerHelper('generate_proposal_labels', **locals())
 
-    rois = helper.create_tmp_variable(dtype=rpn_rois.dtype)
-    labels_int32 = helper.create_tmp_variable(dtype=gt_classes.dtype)
-    bbox_targets = helper.create_tmp_variable(dtype=rpn_rois.dtype)
-    bbox_inside_weights = helper.create_tmp_variable(dtype=rpn_rois.dtype)
-    bbox_outside_weights = helper.create_tmp_variable(dtype=rpn_rois.dtype)
+    rois = helper.create_variable_for_type_inference(dtype=rpn_rois.dtype)
+    labels_int32 = helper.create_variable_for_type_inference(
+        dtype=gt_classes.dtype)
+    bbox_targets = helper.create_variable_for_type_inference(
+        dtype=rpn_rois.dtype)
+    bbox_inside_weights = helper.create_variable_for_type_inference(
+        dtype=rpn_rois.dtype)
+    bbox_outside_weights = helper.create_variable_for_type_inference(
+        dtype=rpn_rois.dtype)
 
     helper.append_op(
         type="generate_proposal_labels",
@@ -1472,7 +1516,7 @@ def generate_proposals(scores,
                        eta=1.0,
                        name=None):
     """
-    ** Generate proposal labels Faster-RCNN **
+    ** Generate proposal Faster-RCNN **
 	
 	This operation proposes RoIs according to each box with their probability to be a foreground object and 
 	the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals
@@ -1504,8 +1548,10 @@ def generate_proposals(scores,
     """
     helper = LayerHelper('generate_proposals', **locals())
 
-    rpn_rois = helper.create_tmp_variable(dtype=bbox_deltas.dtype)
-    rpn_roi_probs = helper.create_tmp_variable(dtype=scores.dtype)
+    rpn_rois = helper.create_variable_for_type_inference(
+        dtype=bbox_deltas.dtype)
+    rpn_roi_probs = helper.create_variable_for_type_inference(
+        dtype=scores.dtype)
     helper.append_op(
         type="generate_proposals",
         inputs={
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index dcd5a064a8..95e13669ad 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -954,7 +954,7 @@ def read_file(reader):
     """
     helper = LayerHelper('read_file')
     out = [
-        helper.create_tmp_variable(
+        helper.create_variable_for_type_inference(
             stop_gradient=True, dtype='float32')
         for _ in range(len(reader.desc.shapes()))
     ]
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 8c11921d9b..eea0a362a0 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -202,10 +202,12 @@ def generate_layer_fn(op_type):
             out_var = out[0] if (isinstance(out, list) or
                                  isinstance(out, tuple)) else out
         else:
-            out_var = helper.create_tmp_variable(dtype=dtype)
+            out_var = helper.create_variable_for_type_inference(dtype=dtype)
         outputs[o_name] = [out_var]
         for name in intermediate_output_names:
-            outputs[name] = [helper.create_tmp_variable(dtype=dtype)]
+            outputs[name] = [
+                helper.create_variable_for_type_inference(dtype=dtype)
+            ]
         helper.append_op(
             type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
         return helper.append_activation(out_var)
@@ -229,7 +231,7 @@ def generate_layer_fn_noattr(op_type):
 
     def func(x, name=None):
         helper = LayerHelper(op_type, **locals())
-        output = helper.create_tmp_variable(dtype=x.dtype)
+        output = helper.create_variable_for_type_inference(dtype=x.dtype)
         helper.append_op(type=op_type, inputs={"X": x}, outputs={"Out": output})
         return output
 
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index dfd801a098..149224bb68 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -27,7 +27,7 @@ from . import nn
 from . import ops
 from . import tensor
 from ..initializer import init_on_cpu
-from ..framework import default_main_program, Parameter, unique_name
+from ..framework import default_main_program, Parameter, unique_name, name_scope
 
 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
@@ -332,14 +332,16 @@ def append_LARS(params_grads, learning_rate, weight_decay):
             return grad_norm + weight_decay * param_norm
 
     for param, grad in params_grads:
-        param_lr = param.optimize_attr['learning_rate']
-        param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
-        grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
-        if type(param_lr) == float and param_lr == 1.0:
-            decayed_lr = learning_rate * param_norm \
-                / _balanced_weight(param_norm, grad_norm)
-        else:
-            decayed_lr = learning_rate * param_lr * param_norm \
-                / _balanced_weight(param_norm, grad_norm)
-        # set back param local learning rate
-        param.optimize_attr['learning_rate'] = decayed_lr
+        with param.block.program.optimized_guard(
+            [param, grad]), name_scope("optimizer"):
+            param_lr = param.optimize_attr['learning_rate']
+            param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
+            grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
+            if type(param_lr) == float and param_lr == 1.0:
+                decayed_lr = learning_rate * param_norm \
+                    / _balanced_weight(param_norm, grad_norm)
+            else:
+                decayed_lr = learning_rate * param_lr * param_norm \
+                    / _balanced_weight(param_norm, grad_norm)
+            # set back param local learning rate
+            param.optimize_attr['learning_rate'] = decayed_lr
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
index a3064b565d..b2d2c93ead 100644
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -58,11 +58,11 @@ def accuracy(input, label, k=1, correct=None, total=None):
     """
     helper = LayerHelper("accuracy", **locals())
     topk_out, topk_indices = nn.topk(input, k=k)
-    acc_out = helper.create_tmp_variable(dtype="float32")
+    acc_out = helper.create_variable_for_type_inference(dtype="float32")
     if correct is None:
-        correct = helper.create_tmp_variable(dtype="int64")
+        correct = helper.create_variable_for_type_inference(dtype="int64")
     if total is None:
-        total = helper.create_tmp_variable(dtype="int64")
+        total = helper.create_variable_for_type_inference(dtype="int64")
     helper.append_op(
         type="accuracy",
         inputs={
@@ -124,8 +124,8 @@ def auc(input,
             auc_out=fluid.layers.auc(input=prediction, label=label)
     """
     helper = LayerHelper("auc", **locals())
-    auc_out = helper.create_tmp_variable(dtype="float64")
-    batch_auc_out = helper.create_tmp_variable(dtype="float64")
+    auc_out = helper.create_variable_for_type_inference(dtype="float64")
+    batch_auc_out = helper.create_variable_for_type_inference(dtype="float64")
     # make tp, tn, fp, fn persistable, so that can accumulate all batches.
 
     # for batch auc
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 538035de1a..d15b85d536 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -154,7 +154,13 @@ __all__ = [
     'mul',
     'sigmoid_cross_entropy_with_logits',
     'maxout',
+    'affine_grid',
+    'sequence_reverse',
     'affine_channel',
+    'hash',
+    'grid_sampler',
+    'log_loss',
+    'add_position_encoding',
 ]
 
 
@@ -242,7 +248,7 @@ def fc(input,
 
         w = helper.create_parameter(
             attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
-        tmp = helper.create_tmp_variable(dtype)
+        tmp = helper.create_variable_for_type_inference(dtype)
         helper.append_op(
             type="mul",
             inputs={"X": input_var,
@@ -255,7 +261,7 @@ def fc(input,
     if len(mul_results) == 1:
         pre_bias = mul_results[0]
     else:
-        pre_bias = helper.create_tmp_variable(dtype)
+        pre_bias = helper.create_variable_for_type_inference(dtype)
         helper.append_op(
             type="sum",
             inputs={"X": mul_results},
@@ -314,7 +320,7 @@ def embedding(input,
     helper = LayerHelper('embedding', **locals())
     w = helper.create_parameter(
         attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
-    tmp = helper.create_tmp_variable(dtype)
+    tmp = helper.create_variable_for_type_inference(dtype)
     padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
         size[0] + padding_idx)
     helper.append_op(
@@ -418,10 +424,10 @@ def dynamic_lstm(input,
     bias = helper.create_parameter(
         attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
 
-    hidden = helper.create_tmp_variable(dtype)
-    cell = helper.create_tmp_variable(dtype)
-    batch_gate = helper.create_tmp_variable(dtype)
-    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+    hidden = helper.create_variable_for_type_inference(dtype)
+    cell = helper.create_variable_for_type_inference(dtype)
+    batch_gate = helper.create_variable_for_type_inference(dtype)
+    batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
     inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
     batch_size = input.shape[0]
     if h_0:
@@ -621,12 +627,12 @@ def dynamic_lstmp(input,
     bias = helper.create_parameter(
         attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
 
-    projection = helper.create_tmp_variable(dtype)
-    cell = helper.create_tmp_variable(dtype)
-    ordered_proj0 = helper.create_tmp_variable(dtype)
-    batch_hidden = helper.create_tmp_variable(dtype)
-    batch_gate = helper.create_tmp_variable(dtype)
-    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+    projection = helper.create_variable_for_type_inference(dtype)
+    cell = helper.create_variable_for_type_inference(dtype)
+    ordered_proj0 = helper.create_variable_for_type_inference(dtype)
+    batch_hidden = helper.create_variable_for_type_inference(dtype)
+    batch_gate = helper.create_variable_for_type_inference(dtype)
+    batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type='lstmp',
@@ -706,8 +712,18 @@ def dynamic_gru(input,
               The first part are weights of the update gate and reset gate with
               shape :math:`(D \\times 2D)`, and the second part are weights for
               candidate hidden state with shape :math:`(D \\times D)`.
-        bias_attr(ParamAttr): The parameter attribute for learnable the
-            hidden-hidden bias.
+
+            If it is set to None or one attribute of ParamAttr, dynamic_gru will
+            create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
+            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates 
+            the bias in the update gate, reset gate and candidate calculations.
+            If it is set to False, no bias will be applied to the update gate, 
+            reset gate and candidate calculations. If it is set to None or one 
+            attribute of ParamAttr, dynamic_gru will create ParamAttr as 
+            bias_attr. If the Initializer of the bias_attr is not set, the bias
+            is initialized zero. Default: None.
         is_reverse(bool): Whether to compute reversed GRU, default
             :attr:`False`.
         gate_activation(str): The activation for update gate and reset gate.
@@ -745,16 +761,16 @@ def dynamic_gru(input,
         attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
     batch_size = input.shape[0]
     inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
-    if h_0 != None:
+    if h_0:
         assert h_0.shape == (
             batch_size, size
         ), 'The shape of h0 should be(batch_size, %d)' % size
         inputs['H0'] = h_0
 
-    hidden = helper.create_tmp_variable(dtype)
-    batch_gate = helper.create_tmp_variable(dtype)
-    batch_reset_hidden_prev = helper.create_tmp_variable(dtype)
-    batch_hidden = helper.create_tmp_variable(dtype)
+    hidden = helper.create_variable_for_type_inference(dtype)
+    batch_gate = helper.create_variable_for_type_inference(dtype)
+    batch_reset_hidden_prev = helper.create_variable_for_type_inference(dtype)
+    batch_hidden = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type='gru',
@@ -806,10 +822,29 @@ def gru_unit(input,
 
     Args:
         input (Variable): The fc transformed input value of current step.
-        hidden (Variable): The hidden value of lstm unit from previous step.
+        hidden (Variable): The hidden value of gru unit from previous step.
         size (integer): The input dimension value.
-        param_attr (ParamAttr): The weight parameters for gru unit. Default: None
-        bias_attr (ParamAttr): The bias parameters for gru unit. Default: None
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            hidden-hidden weight matrix. Note:
+
+            - The shape of the weight matrix is :math:`(T \\times 3D)`, where
+              :math:`D` is the hidden size.
+            - All elements in the weight matrix can be divided into two parts.
+              The first part are weights of the update gate and reset gate with
+              shape :math:`(D \\times 2D)`, and the second part are weights for
+              candidate hidden state with shape :math:`(D \\times D)`.
+
+            If it is set to None or one attribute of ParamAttr, gru_unit will
+            create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
+            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates 
+            the bias in the update gate, reset gate and candidate calculations.
+            If it is set to False, no bias will be applied to the update gate, 
+            reset gate and candidate calculations. If it is set to None or one 
+            attribute of ParamAttr, gru_unit will create ParamAttr as 
+            bias_attr. If the Initializer of the bias_attr is not set, the bias
+            is initialized zero. Default: None.
         activation (string): The activation type for cell (actNode).
                              Default: 'tanh'
         gate_activation (string): The activation type for gates (actGate).
@@ -844,9 +879,9 @@ def gru_unit(input,
     weight = helper.create_parameter(
         attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
 
-    gate = helper.create_tmp_variable(dtype)
-    reset_hidden_pre = helper.create_tmp_variable(dtype)
-    updated_hidden = helper.create_tmp_variable(dtype)
+    gate = helper.create_variable_for_type_inference(dtype)
+    reset_hidden_pre = helper.create_variable_for_type_inference(dtype)
+    updated_hidden = helper.create_variable_for_type_inference(dtype)
     inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight}
     # create bias
     if helper.bias_attr:
@@ -896,10 +931,14 @@ def linear_chain_crf(input, label, param_attr=None):
         attr=helper.param_attr,
         shape=[size + 2, size],
         dtype=helper.input_dtype())
-    alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
-    emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
-    transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
-    log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype())
+    alpha = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
+    emission_exps = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
+    transition_exps = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
+    log_likelihood = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
     helper.append_op(
         type='linear_chain_crf',
         inputs={"Emission": [input],
@@ -938,7 +977,8 @@ def crf_decoding(input, param_attr, label=None):
     """
     helper = LayerHelper('crf_decoding', **locals())
     transition = helper.get_parameter(param_attr.name)
-    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
+    viterbi_path = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
     helper.append_op(
         type='crf_decoding',
         inputs={"Emission": [input],
@@ -962,9 +1002,9 @@ def cos_sim(X, Y):
         Variable: the output of cosine(X, Y).
     """
     helper = LayerHelper('cos_sim', **locals())
-    out = helper.create_tmp_variable(dtype=X.dtype)
-    xnorm = helper.create_tmp_variable(dtype=X.dtype)
-    ynorm = helper.create_tmp_variable(dtype=X.dtype)
+    out = helper.create_variable_for_type_inference(dtype=X.dtype)
+    xnorm = helper.create_variable_for_type_inference(dtype=X.dtype)
+    ynorm = helper.create_variable_for_type_inference(dtype=X.dtype)
     helper.append_op(
         type='cos_sim',
         inputs={'X': [X],
@@ -975,7 +1015,12 @@ def cos_sim(X, Y):
     return out
 
 
-def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
+def dropout(x,
+            dropout_prob,
+            is_test=False,
+            seed=None,
+            name=None,
+            dropout_implementation="downgrade_in_infer"):
     """
     Computes dropout.
 
@@ -995,6 +1040,21 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
                     units will be dropped. DO NOT use a fixed seed in training.
         name (str|None): A name for this layer(optional). If set None, the layer
                          will be named automatically.
+        dropout_implementation(string): ['downgrade_in_infer'(defauld)|'upscale_in_train']
+                                        1. downgrade_in_infer(default), downgrade the outcome at inference
+                                           train: out = input * mask
+                                           inference: out = input * dropout_prob
+                                           (make is a tensor same shape with input, value is 0 or 1
+                                            ratio of 0 is dropout_prob)
+                                        2. upscale_in_train, upscale the outcome at training time
+                                           train: out = input * mask / ( 1.0 - dropout_prob )
+                                           inference: out = input
+                                           (make is a tensor same shape with input, value is 0 or 1
+                                            ratio of 0 is dropout_prob)
+                                           dropout op can be removed from the program. 
+                                           the program will be efficient
+                                        
+
 
     Returns:
         Variable: A tensor variable is the shape with `x`.
@@ -1008,8 +1068,9 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
     """
 
     helper = LayerHelper('dropout', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
-    mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    mask = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True)
 
     if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
         seed = helper.main_program.random_seed
@@ -1023,7 +1084,8 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
             'dropout_prob': dropout_prob,
             'is_test': is_test,
             'fix_seed': seed is not None,
-            'seed': seed if seed is not None else 0
+            'seed': seed if seed is not None else 0,
+            'dropout_implementation': dropout_implementation,
         })
     return out
 
@@ -1094,7 +1156,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100):
           cost = fluid.layers.cross_entropy(input=predict, label=label)
     """
     helper = LayerHelper('cross_entropy', **locals())
-    out = helper.create_tmp_variable(dtype=input.dtype)
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type='cross_entropy',
         inputs={'X': [input],
@@ -1141,14 +1203,14 @@ def square_error_cost(input, label):
 
     """
     helper = LayerHelper('square_error_cost', **locals())
-    minus_out = helper.create_tmp_variable(dtype=input.dtype)
+    minus_out = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type='elementwise_sub',
         inputs={'X': [input],
                 'Y': [label]},
         outputs={'Out': [minus_out]})
 
-    square_out = helper.create_tmp_variable(dtype=input.dtype)
+    square_out = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type='square', inputs={'X': [minus_out]},
         outputs={'Out': [square_out]})
@@ -1254,12 +1316,13 @@ def chunk_eval(input,
     helper = LayerHelper("chunk_eval", **locals())
 
     # prepare output
-    precision = helper.create_tmp_variable(dtype="float32")
-    recall = helper.create_tmp_variable(dtype="float32")
-    f1_score = helper.create_tmp_variable(dtype="float32")
-    num_infer_chunks = helper.create_tmp_variable(dtype="int64")
-    num_label_chunks = helper.create_tmp_variable(dtype="int64")
-    num_correct_chunks = helper.create_tmp_variable(dtype="int64")
+    precision = helper.create_variable_for_type_inference(dtype="float32")
+    recall = helper.create_variable_for_type_inference(dtype="float32")
+    f1_score = helper.create_variable_for_type_inference(dtype="float32")
+    num_infer_chunks = helper.create_variable_for_type_inference(dtype="int64")
+    num_label_chunks = helper.create_variable_for_type_inference(dtype="int64")
+    num_correct_chunks = helper.create_variable_for_type_inference(
+        dtype="int64")
 
     helper.append_op(
         type="chunk_eval",
@@ -1326,7 +1389,7 @@ def sequence_conv(input,
     filter_shape = [filter_size * input.shape[1], num_filters]
     filter_param = helper.create_parameter(
         attr=helper.param_attr, shape=filter_shape, dtype=dtype)
-    pre_bias = helper.create_tmp_variable(dtype)
+    pre_bias = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type='sequence_conv',
@@ -1382,7 +1445,7 @@ def sequence_softmax(input, use_cudnn=False, name=None):
     """
     helper = LayerHelper('sequence_softmax', **locals())
     dtype = helper.input_dtype()
-    softmax_out = helper.create_tmp_variable(dtype)
+    softmax_out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="sequence_softmax",
         inputs={"X": input},
@@ -1436,7 +1499,7 @@ def softmax(input, use_cudnn=True, name=None):
     """
     helper = LayerHelper('softmax', **locals())
     dtype = helper.input_dtype()
-    softmax_out = helper.create_tmp_variable(dtype)
+    softmax_out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="softmax",
         inputs={"X": input},
@@ -1599,7 +1662,7 @@ def conv2d(input,
         dtype=dtype,
         default_initializer=_get_default_param_initializer())
 
-    pre_bias = helper.create_tmp_variable(dtype)
+    pre_bias = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type=l_type,
@@ -1770,7 +1833,7 @@ def conv3d(input,
         dtype=dtype,
         default_initializer=_get_default_param_initializer())
 
-    pre_bias = helper.create_tmp_variable(dtype)
+    pre_bias = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type=l_type,
@@ -1793,7 +1856,7 @@ def conv3d(input,
     return helper.append_activation(pre_act)
 
 
-def sequence_pool(input, pool_type):
+def sequence_pool(input, pool_type, is_test=False):
     """
     This function add the operator for sequence pooling.
     It pools features of all time-steps of each instance, and is applied
@@ -1830,6 +1893,7 @@ def sequence_pool(input, pool_type):
         input(variable): The input variable which is a LoDTensor.
         pool_type (string): The pooling type of sequence_pool.
             It supports average, sum, sqrt and max.
+        is_test(bool, Default False): Used distinguish training from scoring mode.
 
     Returns:
         The sequence pooling variable which is a Tensor.
@@ -1849,15 +1913,16 @@ def sequence_pool(input, pool_type):
     """
     helper = LayerHelper('sequence_pool', **locals())
     dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
-    max_index = helper.create_tmp_variable(dtype)
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    max_index = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type="sequence_pool",
         inputs={"X": input},
         outputs={"Out": pool_out,
                  "MaxIndex": max_index},
-        attrs={"pooltype": pool_type.upper()})
+        attrs={"pooltype": pool_type.upper(),
+               "is_test": is_test})
 
     # when pool_type is max, variable max_index is initialized,
     # so we stop the gradient explicitly here
@@ -1886,7 +1951,7 @@ def sequence_concat(input, name=None):
            out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3])
     """
     helper = LayerHelper('sequence_concat', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     helper.append_op(
         type='sequence_concat', inputs={'X': input}, outputs={'Out': [out]})
     return out
@@ -1962,17 +2027,17 @@ def sequence_slice(input, offset, length, name=None):
     """
     **Sequence Slice Layer**
 
-    The layer crops a subsequence from given sequence with given start 
+    The layer crops a subsequence from given sequence with given start
     offset and subsequence length.
 
     It only supports sequence data (LoDTensor with lod_level equal to 1).
 
     .. code-block:: text
-    
+
 	- Case:
 
             Given the input Variable **input**:
-                
+
                 input.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]],
                 input.lod = [[3, 2]],
                 input.dims = (5, 2),
@@ -1980,16 +2045,16 @@ def sequence_slice(input, offset, length, name=None):
             with offset.data = [[0], [1]] and length.data = [[2], [1]],
 
             the output Variable will be
-                
+
                 out.data = [[a1, a2], [b1, b2], [e1, e2]],
                 out.lod = [[2, 1]],
                 out.dims = (3, 2).
-	
-    NOTE: The first dimension size of **input**, **offset** and **length** 
+
+    NOTE: The first dimension size of **input**, **offset** and **length**
           should be equal. The **offset** should start from 0.
-    
+
     Args:
-        input(Variable): The input Variable which consists of the complete 
+        input(Variable): The input Variable which consists of the complete
                          sequences.
         offset(Variable): The offset to slice each sequence.
         length(Variable): The length of each subsequence.
@@ -2008,12 +2073,12 @@ def sequence_slice(input, offset, length, name=None):
                               dtype='float32', lod_level=1)
              offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32"))
              length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32"))
-             subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset, 
+             subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
                                                    length=length)
     """
     helper = LayerHelper("sequence_slice", **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
 
     offset.stop_gradient = True
     length.stop_gradient = True
@@ -2037,7 +2102,8 @@ def pool2d(input,
            global_pooling=False,
            use_cudnn=True,
            ceil_mode=False,
-           name=None):
+           name=None,
+           exclusive=True):
     """
     ${comment}
 
@@ -2051,11 +2117,13 @@ def pool2d(input,
         pool_type: ${pooling_type_comment}
         pool_stride (int): stride of the pooling layer.
         pool_padding (int): padding size.
-        global_pooling: ${global_pooling_comment}
-        use_cudnn: ${use_cudnn_comment}
-        ceil_mode: ${ceil_mode_comment}
+        global_pooling (bool): ${global_pooling_comment}
+        use_cudnn (bool): ${use_cudnn_comment}
+        ceil_mode (bool): ${ceil_mode_comment}
         name (str|None): A name for this layer(optional). If set None, the
                         layer will be named automatically.
+        exclusive (bool): Whether to exclude padding points in average pooling 
+                          mode, default is true
 
     Returns:
         Variable: The pooling result.
@@ -2099,7 +2167,7 @@ def pool2d(input,
 
     helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
+    pool_out = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type=l_type,
@@ -2113,7 +2181,8 @@ def pool2d(input,
             "paddings": pool_padding,
             "use_cudnn": use_cudnn,
             "ceil_mode": ceil_mode,
-            "use_mkldnn": False
+            "use_mkldnn": False,
+            "exclusive": exclusive,
         })
 
     return pool_out
@@ -2127,7 +2196,8 @@ def pool3d(input,
            global_pooling=False,
            use_cudnn=True,
            ceil_mode=False,
-           name=None):
+           name=None,
+           exclusive=True):
     """
     This function adds the operator for pooling in 3-dimensions, using the
     pooling configurations mentioned in input parameters.
@@ -2143,6 +2213,8 @@ def pool3d(input,
         ceil_mode (bool): ${ceil_mode_comment}
         name (str): A name for this layer(optional). If set None, the layer
             will be named automatically.
+        exclusive (bool): Whether to exclude padding points in average pooling 
+                          mode, default is true
 
     Returns:
         Variable: output of pool3d layer.
@@ -2167,7 +2239,7 @@ def pool3d(input,
     l_type = "pool3d"
     helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
+    pool_out = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type=l_type,
@@ -2181,7 +2253,8 @@ def pool3d(input,
             "paddings": pool_padding,
             "use_cudnn": use_cudnn,
             "ceil_mode": ceil_mode,
-            "use_mkldnn": False
+            "use_mkldnn": False,
+            "exclusive": exclusive,
         })
 
     return pool_out
@@ -2310,10 +2383,13 @@ def batch_norm(input,
     mean_out = mean
     # variance and variance out share the same memory
     variance_out = variance
-    saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
-    saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    saved_mean = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
+    saved_variance = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
 
-    batch_norm_out = input if in_place else helper.create_tmp_variable(dtype)
+    batch_norm_out = input if in_place else helper.create_variable_for_type_inference(
+        dtype)
 
     helper.append_op(
         type="batch_norm",
@@ -2388,12 +2464,12 @@ def layer_norm(input,
         param_attr(ParamAttr|None): The parameter attribute for the learnable
             gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
             omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
-            a default :code:`ParamAttr` would be added as scale. The 
-            :attr:`param_attr` is initialized as 1 if it is added. Default None. 
+            a default :code:`ParamAttr` would be added as scale. The
+            :attr:`param_attr` is initialized as 1 if it is added. Default None.
         bias_attr(ParamAttr|None): The parameter attribute for the learnable
             bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
             omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
-            a default :code:`ParamAttr` would be added as bias. The 
+            a default :code:`ParamAttr` would be added as bias. The
             :attr:`bias_attr` is initialized as 0 if it is added. Default None.
         act(str): Activation to be applied to the output of layer normalizaiton.
                   Default None.
@@ -2430,9 +2506,11 @@ def layer_norm(input,
         inputs['Bias'] = bias
 
     # create output
-    mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
-    variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
-    layer_norm_out = helper.create_tmp_variable(dtype)
+    mean_out = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
+    variance_out = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
+    layer_norm_out = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type="layer_norm",
@@ -2619,7 +2697,7 @@ def conv2d_transpose(input,
     img_filter = helper.create_parameter(
         dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
 
-    pre_bias = helper.create_tmp_variable(dtype=input.dtype)
+    pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type=op_type,
         inputs={'Input': [input],
@@ -2797,7 +2875,7 @@ def conv3d_transpose(input,
     img_filter = helper.create_parameter(
         dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
 
-    pre_bias = helper.create_tmp_variable(dtype=input.dtype)
+    pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type=l_type,
         inputs={'Input': [input],
@@ -2876,7 +2954,7 @@ def sequence_expand(x, y, ref_level=-1, name=None):
     """
     helper = LayerHelper('sequence_expand', input=x, **locals())
     dtype = helper.input_dtype()
-    tmp = helper.create_tmp_variable(dtype)
+    tmp = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='sequence_expand',
         inputs={'X': x,
@@ -2942,7 +3020,7 @@ def sequence_expand_as(x, y, name=None):
     """
     helper = LayerHelper('sequence_expand_as', input=x, **locals())
     dtype = helper.input_dtype()
-    tmp = helper.create_tmp_variable(dtype)
+    tmp = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='sequence_expand_as',
         inputs={'X': x,
@@ -2981,14 +3059,15 @@ def sequence_pad(x, pad_value, maxlen=None, name=None):
 
             x = fluid.layers.data(name='y', shape=[10, 5],
                              dtype='float32', lod_level=1)
-            pad_value = fluid.layers.assign(input=numpy.array([0]))
+            pad_value = fluid.layers.assign(
+                input=numpy.array([0], dtype=numpy.float32))
             out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
     """
 
     helper = LayerHelper('sequence_pad', input=x, **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
-    length = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
+    length = helper.create_variable_for_type_inference(dtype)
 
     pad_value.stop_gradient = True
     length.stop_gradient = True
@@ -3009,8 +3088,8 @@ def sequence_unpad(x, length, name=None):
     """
     **Sequence Unpad Layer**
 
-    This layer removes the padding data in the input sequences and convert 
-    them into sequences with actual length as output, identitied by lod 
+    This layer removes the padding data in the input sequences and convert
+    them into sequences with actual length as output, identitied by lod
     information.
 
     .. code-block:: text
@@ -3020,9 +3099,9 @@ def sequence_unpad(x, length, name=None):
 	Given input Variable **x**:
 	    x.data = [[ 1.0,  2.0,  3.0,  4.0,  5.0],
 		      [ 6.0,  7.0,  8.0,  9.0, 10.0],
-		      [11.0, 12.0, 13.0, 14.0, 15.0]], 
-     
-	in which there are 3 sequences padded to length 5, and the acutal length 
+		      [11.0, 12.0, 13.0, 14.0, 15.0]],
+
+	in which there are 3 sequences padded to length 5, and the acutal length
 	specified by input Variable **length**:
 
 	    length.data = [[2], [3], [4]],
@@ -3030,7 +3109,7 @@ def sequence_unpad(x, length, name=None):
 	after unpadding, the output Variable will be:
 
 	    out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]]
-	    out.lod = [[2, 3, 4]]      
+	    out.lod = [[2, 3, 4]]
 
     Args:
         x(Variable): Input Variable which contains the padded sequences with
@@ -3053,7 +3132,7 @@ def sequence_unpad(x, length, name=None):
 
     helper = LayerHelper('sequence_unpad', input=x, **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
 
     length.stop_gradient = True
 
@@ -3152,8 +3231,9 @@ def beam_search(pre_ids,
     score_type = scores.dtype
     id_type = ids.dtype
 
-    selected_scores = helper.create_tmp_variable(dtype=score_type)
-    selected_ids = helper.create_tmp_variable(dtype=id_type)
+    selected_scores = helper.create_variable_for_type_inference(
+        dtype=score_type)
+    selected_ids = helper.create_variable_for_type_inference(dtype=id_type)
 
     helper.append_op(
         type='beam_search',
@@ -3210,8 +3290,8 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None):
                 ids, scores, beam_size=5, end_id=0)
     """
     helper = LayerHelper('beam_search_decode', **locals())
-    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
-    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
+    sentence_ids = helper.create_variable_for_type_inference(dtype=ids.dtype)
+    sentence_scores = helper.create_variable_for_type_inference(dtype=ids.dtype)
 
     helper.append_op(
         type="beam_search_decode",
@@ -3341,8 +3421,8 @@ def lstm_unit(x_t,
                 param_attr=param_attr,
                 bias_attr=bias_attr)
     dtype = x_t.dtype
-    c = helper.create_tmp_variable(dtype)
-    h = helper.create_tmp_variable(dtype)
+    c = helper.create_variable_for_type_inference(dtype)
+    h = helper.create_variable_for_type_inference(dtype)
 
     helper.append_op(
         type='lstm_unit',
@@ -3396,7 +3476,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
 
     """
     helper = LayerHelper('reduce_sum', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
     helper.append_op(
@@ -3453,7 +3533,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_mean(x, dim=[0, 1]) # [4.0, 5.0]
     """
     helper = LayerHelper('reduce_mean', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
     helper.append_op(
@@ -3508,7 +3588,7 @@ def reduce_max(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_max(x, dim=[0, 1]) # [7.0, 8.0]
     """
     helper = LayerHelper('reduce_max', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
     helper.append_op(
@@ -3563,7 +3643,7 @@ def reduce_min(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_min(x, dim=[0, 1]) # [1.0, 2.0]
     """
     helper = LayerHelper('reduce_min', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
     helper.append_op(
@@ -3619,7 +3699,7 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_prod(x, dim=[0, 1]) # [105.0, 384.0]
     """
     helper = LayerHelper('reduce_prod', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     if dim is not None and not isinstance(dim, list):
         dim = [dim]
     helper.append_op(
@@ -3679,7 +3759,7 @@ def split(input, num_or_sections, dim=-1, name=None):
             dim], 'len(num_or_sections) must not be more than input.shape[dim].'
         num = len(num_or_sections)
     outs = [
-        helper.create_tmp_variable(dtype=helper.input_dtype())
+        helper.create_variable_for_type_inference(dtype=helper.input_dtype())
         for i in range(num)
     ]
     helper.append_op(
@@ -3736,8 +3816,8 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
         axis = 0
     helper = LayerHelper("l2_normalize", **locals())
 
-    out = helper.create_tmp_variable(dtype=x.dtype)
-    norm = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    norm = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type="norm",
         inputs={"X": x},
@@ -3846,7 +3926,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
     __check_input(x, y)
 
     helper = LayerHelper('matmul', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='matmul',
         inputs={'X': x,
@@ -3917,8 +3997,8 @@ def topk(input, k, name=None):
             top5_values, top5_indices = layers.topk(input, k=5)
     """
     helper = LayerHelper("top_k", **locals())
-    values = helper.create_tmp_variable(dtype=input.dtype)
-    indices = helper.create_tmp_variable(dtype="int64")
+    values = helper.create_variable_for_type_inference(dtype=input.dtype)
+    indices = helper.create_variable_for_type_inference(dtype="int64")
     helper.append_op(
         type="top_k",
         inputs={"X": [input]},
@@ -3976,8 +4056,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None):
 
     # remove some tokens from input and labels
     if ignored_tokens is not None and len(ignored_tokens) > 0:
-        erased_input = helper.create_tmp_variable(dtype="int64")
-        erased_label = helper.create_tmp_variable(dtype="int64")
+        erased_input = helper.create_variable_for_type_inference(dtype="int64")
+        erased_label = helper.create_variable_for_type_inference(dtype="int64")
 
         helper.append_op(
             type="sequence_erase",
@@ -3994,8 +4074,8 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None):
         label = erased_label
 
     # edit distance op
-    edit_distance_out = helper.create_tmp_variable(dtype="int64")
-    sequence_num = helper.create_tmp_variable(dtype="int64")
+    edit_distance_out = helper.create_variable_for_type_inference(dtype="int64")
+    sequence_num = helper.create_variable_for_type_inference(dtype="int64")
     helper.append_op(
         type="edit_distance",
         inputs={"Hyps": [input],
@@ -4070,7 +4150,7 @@ def ctc_greedy_decoder(input, blank, name=None):
     _, topk_indices = topk(input, k=1)
 
     # ctc align op
-    ctc_out = helper.create_tmp_variable(dtype="int64")
+    ctc_out = helper.create_variable_for_type_inference(dtype="int64")
     helper.append_op(
         type="ctc_align",
         inputs={"Input": [topk_indices]},
@@ -4120,8 +4200,8 @@ def warpctc(input, label, blank=0, norm_by_times=False):
 
     """
     helper = LayerHelper('warpctc', **locals())
-    loss_out = helper.create_tmp_variable(dtype=input.dtype)
-    grad_out = helper.create_tmp_variable(dtype=input.dtype)
+    loss_out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    grad_out = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type='warpctc',
         inputs={'Logits': [input],
@@ -4182,7 +4262,7 @@ def sequence_reshape(input, new_dim):
             x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10)
     """
     helper = LayerHelper('sequence_reshape', **locals())
-    out = helper.create_tmp_variable(helper.input_dtype())
+    out = helper.create_variable_for_type_inference(helper.input_dtype())
     helper.append_op(
         type='sequence_reshape',
         inputs={'X': [input]},
@@ -4279,9 +4359,9 @@ def nce(input,
             is_bias=True,
             dtype=input.dtype)
         inputs['Bias'] = b
-    cost = helper.create_tmp_variable(dtype=input.dtype)
-    sample_logits = helper.create_tmp_variable(dtype=input.dtype)
-    sample_labels = helper.create_tmp_variable(dtype=label.dtype)
+    cost = helper.create_variable_for_type_inference(dtype=input.dtype)
+    sample_logits = helper.create_variable_for_type_inference(dtype=input.dtype)
+    sample_labels = helper.create_variable_for_type_inference(dtype=label.dtype)
 
     if num_neg_samples is None:
         num_neg_samples = 10
@@ -4357,8 +4437,8 @@ def hsigmoid(input,
 
     helper = LayerHelper('hierarchical_sigmoid', **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
-    pre_out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
+    pre_out = helper.create_variable_for_type_inference(dtype)
     dim = input.shape[1]
     if num_classes < 2:
         raise ValueError("num_classes must not be less than 2.")
@@ -4402,7 +4482,10 @@ def transpose(x, perm, name=None):
     Examples:
         .. code-block:: python
 
-            x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32')
+            # use append_batch_size=False to avoid prepending extra 
+            # batch size in shape
+            x = fluid.layers.data(name='x', shape=[5, 10, 15], 
+                            dtype='float32', append_batch_size=False)
             x_transposed = layers.transpose(x, perm=[1, 0, 2])
     """
 
@@ -4418,8 +4501,8 @@ def transpose(x, perm, name=None):
                 (idx, perm[idx], len(x.shape)))
 
     helper = LayerHelper('transpose', **locals())
-    out = helper.create_tmp_variable(x.dtype)
-    x_shape = helper.create_tmp_variable(x.dtype)
+    out = helper.create_variable_for_type_inference(x.dtype)
+    x_shape = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(
         type='transpose2',
         inputs={'X': [x]},
@@ -4561,7 +4644,7 @@ def im2sequence(input,
         inputs["Y"] = input_image_size
         attrs["out_stride"] = out_stride
     helper = LayerHelper('im2sequence', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     helper.append_op(
         type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs)
     return out
@@ -4594,7 +4677,7 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
     filter_shape = [future_context_size + 1, input.shape[1]]
     filter_param = helper.create_parameter(
         attr=helper.param_attr, shape=filter_shape, dtype=dtype)
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='row_conv',
         inputs={'X': [input],
@@ -4627,7 +4710,7 @@ def multiplex(inputs, index):
         raise ValueError("inputs should be a list object and contains at least "
                          "2 elements.")
 
-    out = helper.create_tmp_variable(inputs[0].dtype)
+    out = helper.create_variable_for_type_inference(inputs[0].dtype)
     helper.append_op(
         type='multiplex',
         inputs={'X': inputs,
@@ -4639,7 +4722,8 @@ def multiplex(inputs, index):
 def softmax_with_cross_entropy(logits,
                                label,
                                soft_label=False,
-                               ignore_index=-100):
+                               ignore_index=-100,
+                               numeric_stable_mode=False):
     """
     **Softmax With Cross Entropy Operator.**
 
@@ -4673,6 +4757,18 @@ def softmax_with_cross_entropy(logits,
         \\left(\\text{logit}_i - \\log\\left(\\sum_{i=0}^{K}
         \\exp(\\text{logit}_i)\\right)\\right), j = 1,...,K
 
+    3) If numeric_stable_mode is True, softmax is calculated first by:
+
+    .. math::
+        
+        max_j = \\max_{i=0}^{K}{\\text{logit}_i}
+
+        log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
+
+        softmax_j = \\exp(logit_j - max_j - {log\\_max\\_sum}_j)
+
+    and then cross entropy loss is calculated by softmax and label.
+
     Args:
         logits (Variable): The unscaled log probabilities, which is a 2-D tensor
             with shape [N x K]. N is the batch_size, and K is the class number.
@@ -4684,6 +4780,13 @@ def softmax_with_cross_entropy(logits,
         ignore_index (int): Specifies a target value that is ignored and does
                             not contribute to the input gradient. Only valid
                             if soft_label is set to False. Default: -100
+        numeric_stable_mode (bool): A flag to indicate whether to use a more
+                                    numerically stable algorithm. Only valid
+                                    when soft_label is False and GPU is used.
+                                    When soft_label is True or CPU is used, 
+                                    the algorithm is always numerically stable. 
+                                    Note that the speed may be slower when use 
+                                    stable algorithm. Default: False
 
     Returns:
         Variable: The cross entropy loss is a 2-D tensor with shape [N x 1].
@@ -4698,16 +4801,19 @@ def softmax_with_cross_entropy(logits,
                 logits=fc, label=label)
     """
     helper = LayerHelper('softmax_with_cross_entropy', **locals())
-    softmax = helper.create_tmp_variable(dtype=logits.dtype)
-    loss = helper.create_tmp_variable(dtype=logits.dtype)
+    softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
+    loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
     helper.append_op(
         type='softmax_with_cross_entropy',
         inputs={'Logits': logits,
                 'Label': label},
         outputs={'Softmax': softmax,
                  'Loss': loss},
-        attrs={'soft_label': soft_label,
-               'ignore_index': ignore_index})
+        attrs={
+            'soft_label': soft_label,
+            'ignore_index': ignore_index,
+            'numeric_stable_mode': numeric_stable_mode
+        })
     return loss
 
 
@@ -4749,8 +4855,8 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     """
 
     helper = LayerHelper('smooth_l1_loss', **locals())
-    diff = helper.create_tmp_variable(dtype=x.dtype)
-    loss = helper.create_tmp_variable(dtype=x.dtype)
+    diff = helper.create_variable_for_type_inference(dtype=x.dtype)
+    loss = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='smooth_l1_loss',
         inputs={
@@ -4783,7 +4889,7 @@ def one_hot(input, depth):
             one_hot_label = layers.one_hot(input=label, depth=10)
     """
     helper = LayerHelper("one_hot", **locals())
-    one_hot_out = helper.create_tmp_variable(dtype='float32')
+    one_hot_out = helper.create_variable_for_type_inference(dtype='float32')
     helper.append_op(
         type="one_hot",
         inputs={'X': input},
@@ -4831,7 +4937,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
     return counter
 
 
-def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
+def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
     """
     Gives a new shape to the input Tensor without changing its data.
 
@@ -4879,15 +4985,22 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                                 :attr:`shape` specifying shape. That is to
                                 say :attr:`actual_shape` has a higher priority
                                 than :attr:`shape`.
-        act (str): The non-linear activation to be applied to output variable.
-        inplace(bool): If this flag is set true, the output
-                       shares data with input without copying, otherwise
-                       a new output tensor is created
-                       whose data is copied from input x.
+        act (str): The non-linear activation to be applied to the reshaped tensor
+                   variable.
+        inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple
+                       operators. If this flag is set :attr:`True`, reuse input
+                       :attr:`x` to reshape, which will change the shape of
+                       tensor variable :attr:`x` and might cause errors when
+                       :attr:`x` is used in multiple operators. If :attr:`False`,
+                       preserve the shape :attr:`x` and create a new output tensor
+                       variable whose data is copied from input x but reshaped.
         name (str): The name of this layer. It is optional.
 
     Returns:
-        Variable: The output tensor.
+        Variable: The reshaped tensor variable if :attr:`act` is None. It is a \
+                  new tensor variable if :attr:`inplace` is :attr:`False`, \
+                  otherwise it is :attr:`x`. If :attr:`act` is not None, return \
+                  the activated tensor variable.
 
     Raises:
         TypeError: if actual_shape is neither Variable nor None.
@@ -4898,7 +5011,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
             data = fluid.layers.data(
                 name='data', shape=[2, 4, 6], dtype='float32')
             reshaped = fluid.layers.reshape(
-                x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True)
+                x=data, shape=[-1, 0, 3, 2], inplace=True)
     """
 
     if not (isinstance(shape, list) or isinstance(shape, tuple)):
@@ -4925,8 +5038,9 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                 "except one unknown dimension.")
 
     helper = LayerHelper("reshape2", **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
-    x_shape = helper.create_tmp_variable(dtype=x.dtype)
+    out = x if inplace else helper.create_variable_for_type_inference(
+        dtype=x.dtype)
+    x_shape = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type="reshape2",
         inputs=inputs,
@@ -4975,8 +5089,8 @@ def squeeze(input, axes, name=None):
             y = layers.sequeeze(input=x, axes=[1])
     """
     helper = LayerHelper("squeeze", **locals())
-    out = helper.create_tmp_variable(dtype=input.dtype)
-    x_shape = helper.create_tmp_variable(dtype=input.dtype)
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type="squeeze2",
         inputs={"X": input},
@@ -5012,8 +5126,8 @@ def unsqueeze(input, axes, name=None):
             y = layers.unsequeeze(input=x, axes=[1])
     """
     helper = LayerHelper("unsqueeze", **locals())
-    out = helper.create_tmp_variable(dtype=input.dtype)
-    x_shape = helper.create_tmp_variable(dtype=input.dtype)
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
     helper.append_op(
         type="unsqueeze2",
         inputs={"X": input},
@@ -5103,7 +5217,7 @@ def lod_reset(x, y=None, target_lod=None):
             out = layers.lod_reset(x=x, y=y)
     """
     helper = LayerHelper("lod_reset", **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     if y is not None:
         helper.append_op(
             type="lod_reset", inputs={'X': x,
@@ -5172,8 +5286,9 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
             "dims of input must be 4(not %d), and it's order must be NCHW" %
             (dims))
 
-    mid_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
-    lrn_out = helper.create_tmp_variable(dtype)
+    mid_out = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True)
+    lrn_out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="lrn",
         inputs={"X": input},
@@ -5238,7 +5353,7 @@ def pad(x, paddings, pad_value=0., name=None):
     """
     helper = LayerHelper('pad', input=x, **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='pad',
         inputs={'X': x},
@@ -5318,7 +5433,7 @@ def pad_constant_like(x, y, pad_value=0., name=None):
     """
     helper = LayerHelper('pad_constant_like', input=x, **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='pad_constant_like',
         inputs={'X': x,
@@ -5383,7 +5498,7 @@ def label_smooth(label,
         raise ValueError("The value of epsilon must be between 0 and 1.")
     helper = LayerHelper("label_smooth", **locals())
     label.stop_gradient = True
-    smooth_label = helper.create_tmp_variable(dtype)
+    smooth_label = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="label_smooth",
         inputs={"X": label,
@@ -5415,8 +5530,8 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
     """
     helper = LayerHelper('roi_pool', **locals())
     dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
-    argmaxes = helper.create_tmp_variable(dtype='int32')
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    argmaxes = helper.create_variable_for_type_inference(dtype='int32')
     helper.append_op(
         type="roi_pool",
         inputs={"X": input,
@@ -5455,16 +5570,16 @@ def roi_align(input,
     Examples:
         .. code-block:: python
 
-            align_out = fluid.layers.roi_align(input=x, 
-                                               rois=rois, 
-                                               pooled_height=7, 
+            align_out = fluid.layers.roi_align(input=x,
+                                               rois=rois,
+                                               pooled_height=7,
                                                pooled_width=7,
                                                spatial_scale=0.5,
                                                sampling_ratio=-1)
     """
     helper = LayerHelper('roi_align', **locals())
     dtype = helper.input_dtype()
-    align_out = helper.create_tmp_variable(dtype)
+    align_out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="roi_align",
         inputs={"X": input,
@@ -5589,7 +5704,7 @@ def image_resize(input,
         out_h = int(input.shape[2] * scale)
         out_w = int(input.shape[3] * scale)
 
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type=resample_methods[resample],
         inputs=inputs,
@@ -5698,7 +5813,7 @@ def gather(input, index):
     """
     helper = LayerHelper('gather', **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="gather",
         inputs={"X": input,
@@ -5738,7 +5853,7 @@ def scatter(input, index, updates, name=None):
     """
     helper = LayerHelper('scatter', **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="scatter",
         inputs={"X": input,
@@ -5798,7 +5913,7 @@ def sequence_scatter(input, index, updates, name=None):
     """
     helper = LayerHelper('sequence_scatter', **locals())
     dtype = helper.input_dtype()
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="sequence_scatter",
         inputs={"X": input,
@@ -5828,7 +5943,7 @@ def random_crop(x, shape, seed=None):
     """
     helper = LayerHelper("random_crop", **locals())
     dtype = x.dtype
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     if seed is None:
         seed = np.random.randint(-65536, 65536)
     op_attrs = {"shape": shape}
@@ -5874,7 +5989,7 @@ def log(x, name=None):
     """
     helper = LayerHelper('log', **locals())
     dtype = helper.input_dtype(input_param_name='x')
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out})
     return out
 
@@ -5905,7 +6020,7 @@ def relu(x, name=None):
     """
     helper = LayerHelper('relu', **locals())
     dtype = helper.input_dtype(input_param_name='x')
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out})
     return out
 
@@ -5944,9 +6059,9 @@ def mean_iou(input, label, num_classes):
     """
     helper = LayerHelper('mean_iou', **locals())
     dtype = helper.input_dtype()
-    out_mean_iou = helper.create_tmp_variable(dtype='float32')
-    out_wrong = helper.create_tmp_variable(dtype='int32')
-    out_correct = helper.create_tmp_variable(dtype='int32')
+    out_mean_iou = helper.create_variable_for_type_inference(dtype='float32')
+    out_wrong = helper.create_variable_for_type_inference(dtype='int32')
+    out_correct = helper.create_variable_for_type_inference(dtype='int32')
     helper.append_op(
         type="mean_iou",
         inputs={"Predictions": input,
@@ -6038,7 +6153,7 @@ def crop(x, shape=None, offsets=None, name=None):
     if offsets is None:
         offsets = [0] * len(x.shape)
 
-    out = helper.create_tmp_variable(x.dtype)
+    out = helper.create_variable_for_type_inference(x.dtype)
     ipts = {'X': x}
     attrs = {}
     if isinstance(shape, Variable):
@@ -6058,6 +6173,124 @@ def crop(x, shape=None, offsets=None, name=None):
     return out
 
 
+def affine_grid(theta, out_shape, name=None):
+    """
+    It generates a grid of (x,y) coordinates using the parameters of
+    the affine transformation that correspond to a set of points where
+    the input feature map should be sampled to produce the transformed
+    output feature map.
+
+    .. code-block:: text
+
+        * Case 1:
+
+          Given:
+
+              theta = [[[x_11, x_12, x_13]
+                        [x_14, x_15, x_16]]
+                       [[x_21, x_22, x_23]
+                        [x_24, x_25, x_26]]]
+      
+              out_shape = [2, 3, 5, 5]
+      
+          Step 1:
+      
+              Generate normalized coordinates according to out_shape.
+              The values of the normalized coordinates are in the interval between -1 and 1.
+              The shape of the normalized coordinates is [2, H, W] as below:
+      
+              C = [[[-1.  -1.  -1.  -1.  -1. ]
+                    [-0.5 -0.5 -0.5 -0.5 -0.5]
+                    [ 0.   0.   0.   0.   0. ]
+                    [ 0.5  0.5  0.5  0.5  0.5]
+                    [ 1.   1.   1.   1.   1. ]]
+                   [[-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]
+                    [-1.  -0.5  0.   0.5  1. ]]]
+              C[0] is the coordinates in height axis and  C[1] is the coordinates in width axis.
+
+          Step2:
+
+              Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get:
+              C_ = [[-1.  -1.   1. ]
+                    [-0.5 -1.   1. ]
+                    [ 0.  -1.   1. ]
+                    [ 0.5 -1.   1. ]
+                    [ 1.  -1.   1. ]
+                    [-1.  -0.5  1. ]
+                    [-0.5 -0.5  1. ]
+                    [ 0.  -0.5  1. ]
+                    [ 0.5 -0.5  1. ]
+                    [ 1.  -0.5  1. ]
+                    [-1.   0.   1. ]
+                    [-0.5  0.   1. ]
+                    [ 0.   0.   1. ]
+                    [ 0.5  0.   1. ]
+                    [ 1.   0.   1. ]
+                    [-1.   0.5  1. ]
+                    [-0.5  0.5  1. ]
+                    [ 0.   0.5  1. ]
+                    [ 0.5  0.5  1. ]
+                    [ 1.   0.5  1. ]
+                    [-1.   1.   1. ]
+                    [-0.5  1.   1. ]
+                    [ 0.   1.   1. ]
+                    [ 0.5  1.   1. ]
+                    [ 1.   1.   1. ]]
+          Step3:
+              Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
+
+    Args:
+        theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
+        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
+        out_shape can be a Variable or a list or tuple.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: The output with shape [N, H, W, 2].
+
+    Raises:
+        ValueError: If the type of arguments is not supported.
+
+    Examples:
+
+        .. code-block:: python
+            theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
+            out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
+            data = fluid.layers.affine_grid(theta, out_shape)
+
+            # or
+            data = fluid.layers.affine_grid(theta, [5, 3, 28, 28])
+
+    """
+    helper = LayerHelper('affine_grid')
+
+    if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \
+        isinstance(out_shape, Variable)):
+        raise ValueError("The out_shape should be a list, tuple or Variable.")
+
+    if not isinstance(theta, Variable):
+        raise ValueError("The theta should be a Variable.")
+
+    out = helper.create_variable_for_type_inference(theta.dtype)
+    ipts = {'Theta': theta}
+    attrs = {}
+    if isinstance(out_shape, Variable):
+        ipts['OutputShape'] = out_shape
+    else:
+        attrs['output_shape'] = out_shape
+
+    helper.append_op(
+        type='affine_grid',
+        inputs=ipts,
+        outputs={'Output': out},
+        attrs=None if len(attrs) == 0 else attrs)
+    return out
+
+
 def rank_loss(label, left, right, name=None):
     """
     **Rank loss layer for RankNet**
@@ -6118,7 +6351,7 @@ def rank_loss(label, left, right, name=None):
     if not (isinstance(right, Variable)):
         raise ValueError("The right should be a Variable")
 
-    out = helper.create_tmp_variable("float32")
+    out = helper.create_variable_for_type_inference("float32")
 
     helper.append_op(
         type='rank_loss',
@@ -6164,8 +6397,8 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
         raise ValueError("The left should be a Variable.")
     if not isinstance(right, Variable):
         raise ValueError("The right should be a Variable.")
-    out = helper.create_tmp_variable(left.dtype)
-    act = helper.create_tmp_variable(left.dtype)
+    out = helper.create_variable_for_type_inference(left.dtype)
+    act = helper.create_variable_for_type_inference(left.dtype)
     helper.append_op(
         type='margin_rank_loss',
         inputs={"Label": label,
@@ -6250,7 +6483,7 @@ def pad2d(input,
 
     helper = LayerHelper('pad2d', **locals())
     dtype = helper.input_dtype(input_param_name='input')
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='pad2d',
         inputs={'X': input},
@@ -6279,7 +6512,7 @@ def elu(x, alpha=1.0, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('elu', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='elu',
         inputs={'X': x},
@@ -6302,7 +6535,7 @@ def relu6(x, threshold=6.0, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('relu6', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='relu6',
         inputs={'X': x},
@@ -6325,7 +6558,7 @@ def pow(x, factor=1.0, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('pow', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='pow',
         inputs={'X': x},
@@ -6349,7 +6582,7 @@ def stanh(x, scale_a=2.0 / 3.0, scale_b=1.7159, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('stanh', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='stanh',
         inputs={'X': x},
@@ -6374,7 +6607,7 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('hard_sigmoid', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='hard_sigmoid',
         inputs={'X': x},
@@ -6398,7 +6631,7 @@ def swish(x, beta=1.0, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('swish', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='swish',
         inputs={'X': x},
@@ -6450,7 +6683,7 @@ def prelu(x, mode, param_attr=None, name=None):
         dtype='float32',
         is_bias=False,
         default_initializer=Constant(1.0))
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="prelu",
         inputs={"X": x,
@@ -6474,7 +6707,7 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('brelu', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='brelu',
         inputs={'X': x},
@@ -6497,7 +6730,7 @@ def leaky_relu(x, alpha=0.02, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('leaky_relu', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='leaky_relu',
         inputs={'X': x},
@@ -6519,7 +6752,7 @@ def soft_relu(x, threshold=40.0, name=None):
         output(${out_type}): ${out_comment}
     """
     helper = LayerHelper('soft_relu', **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='soft_relu',
         inputs={'X': x},
@@ -6586,8 +6819,8 @@ def flatten(x, axis=1, name=None):
     if not (isinstance(axis, int)) or axis > len(x.shape) or axis < 0:
         raise ValueError("The axis should be a int, and in range [0, rank(x)]")
 
-    out = helper.create_tmp_variable(x.dtype)
-    x_shape = helper.create_tmp_variable(x.dtype)
+    out = helper.create_variable_for_type_inference(x.dtype)
+    x_shape = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(
         type='flatten2',
         inputs={"X": x},
@@ -6633,7 +6866,8 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
             out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
     """
     helper = LayerHelper('sequence_enumerate', **locals())
-    out = helper.create_tmp_variable(helper.input_dtype(), stop_gradient=True)
+    out = helper.create_variable_for_type_inference(
+        helper.input_dtype(), stop_gradient=True)
     helper.append_op(
         type='sequence_enumerate',
         inputs={'X': input},
@@ -6673,9 +6907,9 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
 
     helper = LayerHelper('sequence_mask', **locals())
     if name is None:
-        out = helper.create_tmp_variable(dtype=dtype)
+        out = helper.create_variable_for_type_inference(dtype=dtype)
     else:
-        out = helper.create_tmp_variable(dtype=dtype, name=name)
+        out = helper.create_variable_for_type_inference(dtype=dtype, name=name)
 
     helper.append_op(
         type='sequence_mask',
@@ -6718,7 +6952,7 @@ def stack(x, axis=0):
     if not isinstance(x, list) and not isinstance(x, tuple):
         x = [x]
 
-    out = helper.create_tmp_variable(x[0].dtype)
+    out = helper.create_variable_for_type_inference(x[0].dtype)
     helper.append_op(
         type='stack', inputs={'X': x}, outputs={'Y': out},
         attrs={'axis': axis})
@@ -6756,7 +6990,7 @@ def unstack(x, axis=0, num=None):
 
     outs = []
     for _ in num:
-        outs.append(helper.create_tmp_variable(x.dtype))
+        outs.append(helper.create_variable_for_type_inference(x.dtype))
 
     helper.append_op(
         type='unstack',
@@ -6808,7 +7042,7 @@ def expand(x, expand_times, name=None):
     """
     helper = LayerHelper('expand', input=x, **locals())
     dtype = helper.input_dtype(input_param_name='x')
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='expand',
         inputs={'X': x},
@@ -6847,7 +7081,7 @@ def uniform_random_batch_size_like(input,
     """
 
     helper = LayerHelper('uniform_random_batch_size_like', **locals())
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     c_dtype = convert_np_dtype_to_dtype_(dtype)
     helper.append_op(
         type='uniform_random_batch_size_like',
@@ -6884,7 +7118,7 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
     """
 
     helper = LayerHelper('gaussian_random', **locals())
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     c_dtype = convert_np_dtype_to_dtype_(dtype)
     helper.append_op(
         type='gaussian_random',
@@ -6919,7 +7153,7 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
     """
 
     helper = LayerHelper('sampling_id', **locals())
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='sampling_id',
         inputs={'X': x},
@@ -6958,7 +7192,7 @@ def gaussian_random_batch_size_like(input,
     """
 
     helper = LayerHelper('gaussian_random_batch_size_like', **locals())
-    out = helper.create_tmp_variable(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     c_dtype = convert_np_dtype_to_dtype_(dtype)
     helper.append_op(
         type='gaussian_random_batch_size_like',
@@ -6990,7 +7224,8 @@ def sum(x):
     """
 
     helper = LayerHelper('sum', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype('x'))
+    out = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype('x'))
     helper.append_op(
         type='sum',
         inputs={'X': x},
@@ -7017,7 +7252,8 @@ def slice(input, axes, starts, ends):
     """
 
     helper = LayerHelper('slice', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype('input'))
+    out = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype('input'))
     helper.append_op(
         type='slice',
         inputs={'Input': input},
@@ -7043,7 +7279,8 @@ def shape(input):
     """
 
     helper = LayerHelper('shape', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype('input'))
+    out = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype('input'))
     helper.append_op(
         type='shape', inputs={'Input': input}, outputs={'Out': out})
 
@@ -7060,7 +7297,7 @@ def _elementwise_op(helper):
     use_mkldnn = helper.kwargs.get('use_mkldnn', False)
     name = helper.kwargs.get('name', None)
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7094,7 +7331,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
 
     helper = LayerHelper('scale', **locals())
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7160,7 +7397,7 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
 
     if out is None:
         if name is None:
-            out = helper.create_tmp_variable(dtype=x.dtype)
+            out = helper.create_variable_for_type_inference(dtype=x.dtype)
         else:
             out = helper.create_variable(
                 name=name, dtype=x.dtype, persistable=False)
@@ -7268,7 +7505,7 @@ def clip(x, min, max, name=None):
     helper = LayerHelper("clip", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7300,7 +7537,7 @@ def clip_by_norm(x, max_norm, name=None):
     helper = LayerHelper("clip_by_norm", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7330,7 +7567,7 @@ def mean(x, name=None):
     helper = LayerHelper("mean", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7360,7 +7597,7 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
     helper = LayerHelper("mul", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7394,7 +7631,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None):
     helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7424,7 +7661,7 @@ def maxout(x, groups, name=None):
     helper = LayerHelper("maxout", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7437,13 +7674,40 @@ def maxout(x, groups, name=None):
     return out
 
 
+@templatedoc()
+def sequence_reverse(x, name=None):
+    """ 
+    ${comment}
+
+    Args:
+        x(${x_type}): ${x_comment}
+        name(basestring|None): Name of the output.
+
+    Returns:
+        out(${y_type}): ${y_comment}
+    """
+    helper = LayerHelper("sequence_reverse", **locals())
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+
+    helper.append_op(
+        type="sequence_reverse",
+        inputs={"X": x},
+        outputs={"Y": out},
+        attrs=dict())
+    return out
+
+
 def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
     """
     Applies a separate affine transformation to each channel of the input.
     Useful for replacing spatial batch norm with its equivalent fixed
     transformation. The input also can be 2D tensor and applies a affine
     transformation in second dimension.
-    
+
     Args:
         x (Variable): Feature map input can be a 4D tensor with order NCHW
             or NHWC. It also can be a 2D tensor and the affine transformation
@@ -7463,7 +7727,7 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
     helper = LayerHelper("affine_channel", **locals())
 
     if name is None:
-        out = helper.create_tmp_variable(dtype=x.dtype)
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
         out = helper.create_variable(
             name=name, dtype=x.dtype, persistable=False)
@@ -7476,3 +7740,248 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
         attrs={"data_layout": data_layout},
         outputs={"Out": out})
     return out
+
+
+def hash(input, hash_size, num_hash=1, name=None):
+    """
+    Hash the input to an integer whose value is less than the given hash size.
+
+    The hash algorithm we used was xxHash - Extremely fast hash algorithm
+    (https://github.com/Cyan4973/xxHash/tree/v0.6.5)
+
+    A simple example as below:
+
+    .. code-block:: text
+
+        Given:
+
+        # shape [2, 2]
+        input.data = [
+            [[1], [2]],
+            [[3], [4]],
+        ]
+
+        input.lod = [[0, 2]]
+
+        hash_size = 10000
+
+        num_hash = 4
+
+        Then:
+
+        Hash op will take all number in input's 2nd dimension as hash algorithm's
+        input for each time. Each input will be hashed for 4 times, and get an
+        array whose length is 4. Each value in the array ranges from 0 to 9999.
+
+        # shape [2, 4]
+        output.data = [
+            [[9662], [9217], [1129], [8487]],
+            [[8310], [1327], [1654], [4567]],
+        ]
+
+        output.lod = [[0, 2]]
+
+    Args:
+        input (Variable): The input variable which is a one-hot word. The
+            dimensions of the input variable must be 2.
+        hash_size (int): The space size for hash algorithm. The output value
+            will keep in the range:math:`[0, hash_size - 1]`.
+        num_hash (int): The times of hash, default 1.
+        name (str, default None): The name of this layer.
+
+    Returns:
+       Variable: The hash result variable which is a LoDTensor.
+
+    Examples:
+       .. code-block:: python
+           word_dict = paddle.dataset.imdb.word_dict()
+           x = fluid.layers.data(shape[1], dtype='int32', lod_level=1)
+           out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000)
+    """
+    helper = LayerHelper('hash', **locals())
+    out = helper.create_variable_for_type_inference(
+        helper.input_dtype(), stop_gradient=True)
+    helper.append_op(
+        type='hash',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={'num_hash': num_hash,
+               'mod_by': hash_size})
+    return out
+
+
+@templatedoc()
+def grid_sampler(x, grid, name=None):
+    """
+    This operation samples input X by using bilinear interpolation based on 
+    flow field grid, which is usually gennerated by affine_grid. The grid of
+    shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates 
+    with shape [N, H, W] each, where grid_x is indexing the 4th dimension 
+    (in width dimension) of input data x and grid_y is indexng the 3rd 
+    dimention (in height dimension), finally results is the bilinear 
+    interpolation value of 4 nearest corner points.
+
+    Step 1:
+    Get (x, y) grid coordinates and scale to [0, H-1/W-1].
+
+    grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
+    grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
+
+    Step 2:
+    Indices input data X with grid (x, y) in each [H, W] area, and bilinear 
+    interpolate point value by 4 nearest points.
+
+      wn ------- y_n ------- en
+      |           |           |
+      |          d_n          |
+      |           |           |
+     x_w --d_w-- grid--d_e-- x_e
+      |           |           |
+      |          d_s          |
+      |           |           |
+      ws ------- y_s ------- wn
+
+    x_w = floor(x)              // west side x coord
+    x_e = x_w + 1               // east side x coord
+    y_n = floor(y)              // north side y coord
+    y_s = y_s + 1               // south side y coord
+
+    d_w = grid_x - x_w          // distance to west side
+    d_e = x_e - grid_x          // distance to east side
+    d_n = grid_y - y_n          // distance to north side
+    d_s = y_s - grid_y          // distance to south side
+
+    wn = X[:, :, y_n, x_w]      // north-west point value
+    en = X[:, :, y_n, x_e]      // north-east point value
+    ws = X[:, :, y_s, x_w]      // south-east point value
+    es = X[:, :, y_s, x_w]      // north-east point value
+
+    output = wn * d_e * d_s + en * d_w * d_s
+           + ws * d_e * d_n + es * d_w * d_n
+
+    Args:
+        x(Variable): Input data of shape [N, C, H, W].
+        grid(Variable): Input grid tensor of shape [N, H, W, 2].
+        name (str, default None): The name of this layer.
+
+    Returns:
+        out(Variable): Output of shape [N, C, H, W] data samples input X 
+        using bilnear interpolation based on input grid.
+
+    Exmples:
+    .. code-block:: python
+
+        x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32')
+        theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32')
+        grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]})
+        out = fluid.layers.grid_sampler(x=x, grid=grid)
+    """
+    helper = LayerHelper("grid_sampler", **locals())
+
+    if not isinstance(x, Variable):
+        return ValueError("The x should be a Variable")
+
+    if not isinstance(grid, Variable):
+        return ValueError("The grid should be a Variable")
+
+    out = helper.create_variable_for_type_inference(x.dtype)
+    ipts = {'X': x, 'Grid': grid}
+
+    helper.append_op(type='grid_sampler', inputs=ipts, outputs={'Output': out})
+    return out
+
+
+def log_loss(input, label, epsilon=1e-4, name=None):
+    """
+    **Negative Log Loss Layer**
+
+    This layer accepts input predictions and target label and returns the
+    negative log loss.
+
+    .. math::
+
+        Out = -label * \\log{(input + \\epsilon)}
+              - (1 - label) * \\log{(1 - input + \\epsilon)}
+
+    Args:
+        input (Variable|list):  a 2-D tensor with shape [N x 1], where N is the
+                                batch size. This input is a probability computed
+                                by the previous operator.
+        label (Variable|list):  the ground truth which is a 2-D tensor with
+                                shape [N x 1], where N is the batch size.
+        epsilon (float): epsilon
+        name (string): the name of log_loss
+
+    Returns:
+        Variable: A 2-D tensor with shape [N x 1], the negative log loss.
+
+    Examples:
+        .. code-block:: python
+
+          prob = fluid.layers.sigmoid(net)
+          cost = fluid.layers.log_loss(input=prob, label=label)
+    """
+    helper = LayerHelper('log_loss', **locals())
+
+    if name is None:
+        loss = helper.create_variable_for_type_inference(dtype=input.dtype)
+    else:
+        loss = helper.create_variable(
+            name=name, dtype=input.dtype, persistable=False)
+
+    helper.append_op(
+        type='log_loss',
+        inputs={'Predicted': [input],
+                'Labels': [label]},
+        outputs={'Loss': [loss]},
+        attrs={'epsilon': epsilon})
+    return loss
+
+
+def add_position_encoding(input, alpha, beta, name=None):
+    """
+    **Add Position Encoding Layer**
+
+    This layer accepts an input 3D-Tensor of shape [N x M x P], and return an
+    output Tensor of shape [N x M x P] with positional encoding value.
+
+    Refer to `Attention Is All You Need<http://arxiv.org/pdf/1706.03762.pdf>`_ .
+
+    .. math::
+        PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})}   \\\\
+        PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})}  \\\\
+        Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i)
+
+    Where:
+    * PE(pos, 2i): the increment for the number at even position
+    * PE(pos, 2i + 1): the increment for the number at odd position
+
+    Args:
+        input (Variable): 3-D input tensor with shape [N x M x P]
+        alpha (float): multiple of Input Tensor
+        beta (float): multiple of Positional Encoding Tensor
+        name (string): the name of position encoding layer
+
+    Returns:
+        Variable: A 3-D Tensor of shape [N x M x P] with positional encoding.
+
+    Examples:
+        .. code-block:: python
+
+          position_tensor = fluid.layers.add_position_encoding(input=tensor)
+    """
+    helper = LayerHelper('add_position_encoding', **locals())
+    dtype = helper.input_dtype()
+
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=dtype)
+    else:
+        out = helper.create_variable(name=name, dtype=dtype, persistable=False)
+
+    helper.append_op(
+        type="add_position_encoding",
+        inputs={"X": input},
+        outputs={"Out": out},
+        attrs={"alpha": alpha,
+               "beta": beta})
+    return out
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 9c6a2112a6..09a7cb8dc9 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -152,7 +152,7 @@ def cast(x, dtype):
             result = fluid.layers.cast(x=data, dtype='float64')
     """
     helper = LayerHelper('cast', **locals())
-    out = helper.create_tmp_variable(dtype=dtype)
+    out = helper.create_variable_for_type_inference(dtype=dtype)
     helper.append_op(
         type='cast',
         inputs={'X': [x]},
@@ -184,7 +184,7 @@ def concat(input, axis=0, name=None):
            out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
     """
     helper = LayerHelper('concat', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
     helper.append_op(
         type='concat',
         inputs={'X': input},
@@ -221,7 +221,8 @@ def sums(input, out=None):
     """
     helper = LayerHelper('sum', **locals())
     if out is None:
-        out = helper.create_tmp_variable(dtype=helper.input_dtype())
+        out = helper.create_variable_for_type_inference(
+            dtype=helper.input_dtype())
     helper.append_op(
         type='sum',
         inputs={'X': input},
@@ -252,7 +253,7 @@ def assign(input, output=None):
     """
     helper = LayerHelper('assign', **locals())
     if output is None:
-        output = helper.create_tmp_variable(dtype=input.dtype)
+        output = helper.create_variable_for_type_inference(dtype=input.dtype)
     if isinstance(input, Variable):
         helper.append_op(
             type='assign', inputs={'X': [input]}, outputs={'Out': [output]})
@@ -311,7 +312,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
 
     helper = LayerHelper("fill_constant", **locals())
     if out is None:
-        out = helper.create_tmp_variable(dtype=dtype)
+        out = helper.create_variable_for_type_inference(dtype=dtype)
     helper.append_op(
         type='fill_constant',
         inputs={},
@@ -358,7 +359,7 @@ def fill_constant_batch_size_like(input,
         ${out_comment}.
     """
     helper = LayerHelper("fill_constant_batch_size_like", **locals())
-    out = helper.create_tmp_variable(dtype=dtype)
+    out = helper.create_variable_for_type_inference(dtype=dtype)
     helper.append_op(
         type='fill_constant_batch_size_like',
         inputs={'Input': input},
@@ -396,7 +397,7 @@ def argmin(x, axis=0):
           out = fluid.layers.argmin(x=in, axis=-1)
     """
     helper = LayerHelper("arg_min", **locals())
-    out = helper.create_tmp_variable(VarDesc.VarType.INT64)
+    out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64)
     helper.append_op(
         type='arg_min',
         inputs={'X': x},
@@ -427,7 +428,7 @@ def argmax(x, axis=0):
           out = fluid.layers.argmax(x=in, axis=-1)
     """
     helper = LayerHelper("arg_max", **locals())
-    out = helper.create_tmp_variable(VarDesc.VarType.INT64)
+    out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64)
     helper.append_op(
         type='arg_max',
         inputs={'X': x},
@@ -477,8 +478,10 @@ def argsort(input, axis=-1, name=None):
             out, indices = fluid.layers.argsort(input, axis=0)
     """
     helper = LayerHelper("argsort", **locals())
-    out = helper.create_tmp_variable(dtype=input.dtype, stop_gradient=True)
-    ids = helper.create_tmp_variable(VarDesc.VarType.INT64, stop_gradient=True)
+    out = helper.create_variable_for_type_inference(
+        dtype=input.dtype, stop_gradient=True)
+    ids = helper.create_variable_for_type_inference(
+        VarDesc.VarType.INT64, stop_gradient=True)
     helper.append_op(
         type='argsort',
         inputs={'X': input},
@@ -562,7 +565,7 @@ def reverse(x, axis):
     if isinstance(axis, int):
         axis = [axis]
     helper = LayerHelper("reverse", **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='reverse',
         inputs={'Input': x},
@@ -654,7 +657,7 @@ def has_inf(x):
         Variable: The tensor variable storing the output, only a bool value.
     """
     helper = LayerHelper("isinf", **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(type="isinf", inputs={"X": x}, outputs={"Out": out})
     return out
 
@@ -670,7 +673,7 @@ def has_nan(x):
         Variable: The tensor variable storing the output, only a bool value.
     """
     helper = LayerHelper("isnan", **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(type="isnan", inputs={"X": x}, outputs={"Out": out})
     return out
 
@@ -687,6 +690,6 @@ def isfinite(x):
         Variable: The tensor variable storing the output, contains a bool value.
     """
     helper = LayerHelper("isfinite", **locals())
-    out = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out})
     return out
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 0c2800dcf3..f65b37903a 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 """
 Fluid Metrics
-
-The metrics are accomplished via Python natively.
 """
 
 from __future__ import print_function
@@ -24,6 +22,12 @@ import copy
 import warnings
 import six
 
+from .layer_helper import LayerHelper
+from .initializer import Constant
+from . import unique_name
+from .framework import Program, Variable, program_guard
+from . import layers
+
 __all__ = [
     'MetricBase',
     'CompositeMetric',
@@ -190,7 +194,7 @@ class CompositeMetric(MetricBase):
                                or soft-label, should custom the corresponding update rule.
         """
         for m in self._metrics:
-            ans.append(m.update(preds, labels))
+            m.update(preds, labels)
 
     def eval(self):
         """
@@ -474,71 +478,10 @@ class EditDistance(MetricBase):
                 "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
             )
         avg_distance = self.total_distance / self.seq_num
-        avg_instance_error = self.instance_error / self.seq_num
+        avg_instance_error = self.instance_error / float(self.seq_num)
         return avg_distance, avg_instance_error
 
 
-class DetectionMAP(MetricBase):
-    """
-    Calculate the detection mean average precision (mAP).
-    mAP is the metric to measure the accuracy of object detectors
-    like Faster R-CNN, SSD, etc.
-    It is the average of the maximum precisions at different recall values.
-    Please get more information from the following articles:
-      https://sanchom.wordpress.com/tag/average-precision/
-
-      https://arxiv.org/abs/1512.02325
-
-    The general steps are as follows:
-
-        1. calculate the true positive and false positive according to the input
-            of detection and labels.
-        2. calculate mAP value, support two versions: '11 point' and 'integral'.
-
-    Examples:
-        .. code-block:: python
-
-            pred = fluid.layers.fc(input=data, size=1000, act="tanh")
-            batch_map = layers.detection_map(
-                input,
-                label,
-                class_num,
-                background_label,
-                overlap_threshold=overlap_threshold,
-                evaluate_difficult=evaluate_difficult,
-                ap_version=ap_version)
-            metric = fluid.metrics.DetectionMAP()
-            for data in train_reader():
-                loss, preds, labels = exe.run(fetch_list=[cost, batch_map])
-                batch_size = data[0]
-                metric.update(value=batch_map, weight=batch_size)
-                numpy_map = metric.eval()
-    """
-
-    def __init__(self, name=None):
-        super(DetectionMAP, self).__init__(name)
-        # the current map value
-        self.value = .0
-        self.weight = .0
-
-    def update(self, value, weight):
-        if not _is_number_or_matrix_(value):
-            raise ValueError(
-                "The 'value' must be a number(int, float) or a numpy ndarray.")
-        if not _is_number_(weight):
-            raise ValueError("The 'weight' must be a number(int, float).")
-        self.value += value
-        self.weight += weight
-
-    def eval(self):
-        if self.weight == 0:
-            raise ValueError(
-                "There is no data in DetectionMAP Metrics. "
-                "Please check layers.detection_map output has added to DetectionMAP."
-            )
-        return self.value / self.weight
-
-
 class Auc(MetricBase):
     """
     Auc metric adapts to the binary classification.
@@ -616,3 +559,179 @@ class Auc(MetricBase):
             idx -= 1
 
         return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0
+
+
+class DetectionMAP(object):
+    """
+    Calculate the detection mean average precision (mAP).
+
+    The general steps are as follows:
+    1. calculate the true positive and false positive according to the input
+        of detection and labels.
+    2. calculate mAP value, support two versions: '11 point' and 'integral'.
+
+    Please get more information from the following articles:
+      https://sanchom.wordpress.com/tag/average-precision/
+      https://arxiv.org/abs/1512.02325
+
+    Args:
+        input (Variable): The detection results, which is a LoDTensor with shape
+            [M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax].
+        gt_label (Variable): The ground truth label index, which is a LoDTensor
+            with shape [N, 1].
+        gt_box (Variable): The ground truth bounding box (bbox), which is a
+            LoDTensor with shape [N, 4]. The layout is [xmin, ymin, xmax, ymax].
+        gt_difficult (Variable|None): Whether this ground truth is a difficult
+            bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
+            it means all the ground truth labels are not difficult bbox.
+        class_num (int): The class number.
+        background_label (int): The index of background label, the background
+            label will be ignored. If set to -1, then all categories will be
+            considered, 0 by defalut.
+        overlap_threshold (float): The threshold for deciding true/false
+            positive, 0.5 by defalut.
+        evaluate_difficult (bool): Whether to consider difficult ground truth
+            for evaluation, True by defalut. This argument does not work when
+            gt_difficult is None.
+        ap_version (string): The average precision calculation ways, it must be
+            'integral' or '11point'. Please check
+            https://sanchom.wordpress.com/tag/average-precision/ for details.
+            - 11point: the 11-point interpolated average precision.
+            - integral: the natural integral of the precision-recall curve.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(place)
+            map_evaluator = fluid.Evaluator.DetectionMAP(input,
+                gt_label, gt_box, gt_difficult)
+            cur_map, accum_map = map_evaluator.get_map_var()
+            fetch = [cost, cur_map, accum_map]
+            for epoch in PASS_NUM:
+                map_evaluator.reset(exe)
+                for data in batches:
+                    loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
+
+        In the above example:
+
+            'cur_map_v' is the mAP of current mini-batch.
+            'accum_map_v' is the accumulative mAP of one pass.
+    """
+
+    def __init__(self,
+                 input,
+                 gt_label,
+                 gt_box,
+                 gt_difficult=None,
+                 class_num=None,
+                 background_label=0,
+                 overlap_threshold=0.5,
+                 evaluate_difficult=True,
+                 ap_version='integral'):
+
+        self.helper = LayerHelper('map_eval')
+        gt_label = layers.cast(x=gt_label, dtype=gt_box.dtype)
+        if gt_difficult:
+            gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
+            label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
+        else:
+            label = layers.concat([gt_label, gt_box], axis=1)
+
+        # calculate mean average precision (mAP) of current mini-batch
+        map = layers.detection_map(
+            input,
+            label,
+            class_num,
+            background_label,
+            overlap_threshold=overlap_threshold,
+            evaluate_difficult=evaluate_difficult,
+            ap_version=ap_version)
+
+        states = []
+        states.append(
+            self._create_state(
+                dtype='int32', shape=None, suffix='accum_pos_count'))
+        states.append(
+            self._create_state(
+                dtype='float32', shape=None, suffix='accum_true_pos'))
+        states.append(
+            self._create_state(
+                dtype='float32', shape=None, suffix='accum_false_pos'))
+        var = self._create_state(dtype='int32', shape=[1], suffix='has_state')
+        self.helper.set_variable_initializer(
+            var, initializer=Constant(value=int(0)))
+        self.has_state = var
+
+        # calculate accumulative mAP
+        accum_map = layers.detection_map(
+            input,
+            label,
+            class_num,
+            background_label,
+            overlap_threshold=overlap_threshold,
+            evaluate_difficult=evaluate_difficult,
+            has_state=self.has_state,
+            input_states=states,
+            out_states=states,
+            ap_version=ap_version)
+
+        layers.fill_constant(
+            shape=self.has_state.shape,
+            value=1,
+            dtype=self.has_state.dtype,
+            out=self.has_state)
+
+        self.cur_map = map
+        self.accum_map = accum_map
+
+    def _create_state(self, suffix, dtype, shape):
+        """
+        Create state variable.
+        Args:
+            suffix(str): the state suffix.
+            dtype(str|core.VarDesc.VarType): the state data type
+            shape(tuple|list): the shape of state
+        Returns: State variable
+        """
+        state = self.helper.create_variable(
+            name="_".join([unique_name.generate(self.helper.name), suffix]),
+            persistable=True,
+            dtype=dtype,
+            shape=shape)
+        return state
+
+    def get_map_var(self):
+        """
+        Returns: mAP variable of current mini-batch and
+            accumulative mAP variable cross mini-batches.
+        """
+        return self.cur_map, self.accum_map
+
+    def reset(self, executor, reset_program=None):
+        """
+        Reset metric states at the begin of each pass/user specified batch.
+
+        Args:
+            executor(Executor): a executor for executing
+                the reset_program.
+            reset_program(Program|None): a single Program for reset process.
+                If None, will create a Program.
+        """
+
+        def _clone_var_(block, var):
+            assert isinstance(var, Variable)
+            return block.create_var(
+                name=var.name,
+                shape=var.shape,
+                dtype=var.dtype,
+                type=var.type,
+                lod_level=var.lod_level,
+                persistable=var.persistable)
+
+        if reset_program is None:
+            reset_program = Program()
+        with program_guard(main_program=reset_program):
+            var = _clone_var_(reset_program.current_block(), self.has_state)
+            layers.fill_constant(
+                shape=var.shape, value=0, dtype=var.dtype, out=var)
+        executor.run(reset_program)
diff --git a/python/paddle/fluid/op.py b/python/paddle/fluid/op.py
index 667db10d3e..4e1d1450de 100644
--- a/python/paddle/fluid/op.py
+++ b/python/paddle/fluid/op.py
@@ -120,6 +120,8 @@ class OpDescCreationMethod(object):
                     new_attr.strings.extend(user_defined_attr)
                 elif attr.type == framework_pb2.BOOLEANS:
                     new_attr.bools.extend(user_defined_attr)
+                elif attr.type == framework_pb2.LONGS:
+                    new_attr.longs.extend(user_defined_attr)
                 elif attr.type == framework_pb2.INT_PAIRS:
                     for p in user_defined_attr:
                         pair = new_attr.int_pairs.add()
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 17af44afdd..7e2364a5a8 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 import re
+import sys
 from collections import defaultdict
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program
 from . import framework
@@ -32,7 +33,8 @@ __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
     'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
-    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'RMSPropOptimizer'
+    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'LarsMomentum',
+    'LarsMomentumOptimizer'
 ]
 
 
@@ -105,13 +107,14 @@ class Optimizer(object):
         param = param_and_grad[0]
         param_lr = param.optimize_attr['learning_rate']
         if type(param_lr) == Variable:
-            print("returns updated param lr ", param_lr)
             return param_lr
         else:
             if param_lr == 1.0:
                 return self._global_learning_rate()
             else:
-                with default_main_program()._lr_schedule_guard():
+                with default_main_program()._lr_schedule_guard(
+                        is_with_opt=True), framework.name_scope(
+                            'scale_with_param_lr'):
                     return self._global_learning_rate() * param_lr
 
     def _create_accumulators(self, block, parameters):
@@ -398,6 +401,91 @@ class MomentumOptimizer(Optimizer):
         return momentum_op
 
 
+class LarsMomentumOptimizer(Optimizer):
+    """
+    Momentum optimizer with LARS support
+
+    The update equations are as follows:
+
+    .. math::
+
+        & local\_learning\_rate = learning\_rate * lars\_coeff * \\
+          \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||}
+
+        & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param)
+
+        & param = param - velocity
+
+    Args:
+        learning_rate (float|Variable): the learning rate used to update parameters. \
+        Can be a float value or a Variable with one float value as data element.
+        momentum (float): momentum factor
+        lars_coeff (float): defines how much we trust the layer to change its weights.
+        lars_weight_decay (float): weight decay coefficient for decaying using LARS.
+        regularization: A Regularizer, such as
+                        fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.
+        
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.LarsMomentum(learning_rate=0.2, momentum=0.1, lars_weight_decay=0.001)
+            optimizer.minimize(cost)
+    """
+    _velocity_acc_str = "velocity"
+
+    def __init__(self,
+                 learning_rate,
+                 momentum,
+                 lars_coeff=0.001,
+                 lars_weight_decay=0.0005,
+                 regularization=None,
+                 name=None):
+        assert learning_rate is not None
+        assert momentum is not None
+        super(LarsMomentumOptimizer, self).__init__(
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
+        self.type = "lars_momentum"
+        self._momentum = momentum
+        self._lars_coeff = float(lars_coeff)
+        self._lars_weight_decay = float(lars_weight_decay)
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        for p in parameters:
+            self._add_accumulator(self._velocity_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        velocity_acc = self._get_accumulator(self._velocity_acc_str,
+                                             param_and_grad[0])
+        # create the momentum optimize op
+        momentum_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Velocity": velocity_acc,
+                "LearningRate": self._create_param_lr(param_and_grad)
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "VelocityOut": velocity_acc
+            },
+            attrs={
+                "mu": self._momentum,
+                "lars_coeff": self._lars_coeff,
+                "lars_weight_decay": self._lars_weight_decay
+            })
+
+        return momentum_op
+
+
 class AdagradOptimizer(Optimizer):
     """
     **Adaptive Gradient Algorithm (Adagrad)**
@@ -602,7 +690,8 @@ class AdamOptimizer(Optimizer):
         for param, grad in param_and_grads:
             if grad is None:
                 continue
-            with param.block.program._optimized_guard([param, grad]):
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope("optimizer"):
                 beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                                       param)
                 beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
@@ -740,7 +829,8 @@ class AdamaxOptimizer(Optimizer):
         for param, grad in parameters_and_grads:
             if grad is None:
                 continue
-            with param.block.program._optimized_guard([param, grad]):
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope('adamx'):
                 beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                                       param)
                 main_block.append_op(
@@ -1217,6 +1307,7 @@ DecayedAdagrad = DecayedAdagradOptimizer
 Adadelta = AdadeltaOptimizer
 RMSProp = RMSPropOptimizer
 Ftrl = FtrlOptimizer
+LarsMomentum = LarsMomentumOptimizer
 
 
 class ModelAverage(Optimizer):
@@ -1279,7 +1370,8 @@ class ModelAverage(Optimizer):
         for param, grad in self.params_grads:
             if grad is None:
                 continue
-            with param.block.program._optimized_guard([param, grad]):
+            with param.block.program._optimized_guard(
+                [param, grad]), name_scope('move_average'):
                 self._append_average_accumulate_op(param)
 
         self.apply_program = Program()
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index 97644df007..57185da4d1 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -47,7 +47,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
         if grad is None:
             params_and_grads.append((param, grad))
             continue
-        with param.block.program._optimized_guard([param, grad]):
+        with param.block.program._optimized_guard(
+            [param, grad]), framework.name_scope('regularization'):
             regularization_term = None
             if param.regularizer is not None:
                 # Add variable for regularization term in grad block
@@ -151,7 +152,7 @@ class L2DecayRegularizer(WeightDecayRegularizer):
             decay = block.create_var(
                 dtype="float32",
                 shape=param.shape,
-                type=core.VarDesc.VarType.SELECTED_ROWS)
+                type=core.VarDesc.VarType.LOD_TENSOR)
             block.append_op(
                 type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
             block.append_op(
@@ -228,7 +229,7 @@ class L1DecayRegularizer(WeightDecayRegularizer):
             decay = block.create_var(
                 dtype="float32",
                 shape=param.shape,
-                type=core.VarDesc.VarType.SELECTED_ROWS)
+                type=core.VarDesc.VarType.LOD_TENSOR)
             block.append_op(
                 type='extract_rows', inputs={'X': grad}, outputs={'Out': idx})
             block.append_op(
diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt
index d6568cd38e..d24417bbac 100644
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@@ -1,9 +1,3 @@
-if(NOT APPLE)
-  set(PYTHON_TESTS_DIR ${CMAKE_CURRENT_BINARY_DIR} CACHE PATH "python tests directory")
-else()
-  set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
-endif(NOT APPLE)
-
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
index 673c965b66..91c1d17eb5 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
@@ -1,7 +1,19 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-# default test
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
+if(NOT APPLE)
+    # default test
+    foreach(src ${TEST_OPS})
+        py_test(${src} SRCS ${src}.py)
+    endforeach()
+else()
+    foreach(src ${TEST_OPS})
+        if(${src} STREQUAL "test_image_classification_vgg")
+            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
+        elseif(${src} STREQUAL "test_image_classification_resnet")
+            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
+        elseif()
+            py_test(${src} SRCS ${src}.py)
+        endif()
+    endforeach()
+endif()
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 56129641ce..28dc751957 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -301,7 +301,7 @@ class TestRpnTargetAssign(unittest.TestCase):
                 dtype='float32',
                 lod_level=1,
                 append_batch_size=False)
-            pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign(
+            pred_scores, pred_loc, tgt_lbl, tgt_bbox, bbox_inside_weight = layers.rpn_target_assign(
                 bbox_pred=bbox_pred,
                 cls_logits=cls_logits,
                 anchor_box=anchor_box,
@@ -313,15 +313,18 @@ class TestRpnTargetAssign(unittest.TestCase):
                 rpn_straddle_thresh=0.0,
                 rpn_fg_fraction=0.5,
                 rpn_positive_overlap=0.7,
-                rpn_negative_overlap=0.3)
+                rpn_negative_overlap=0.3,
+                use_random=False)
 
             self.assertIsNotNone(pred_scores)
             self.assertIsNotNone(pred_loc)
             self.assertIsNotNone(tgt_lbl)
             self.assertIsNotNone(tgt_bbox)
+            self.assertIsNotNone(bbox_inside_weight)
             assert pred_scores.shape[1] == 1
             assert pred_loc.shape[1] == 4
             assert pred_loc.shape[1] == tgt_bbox.shape[1]
+            print(str(program))
 
 
 class TestGenerateProposals(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 7de0ebce06..2e87d8f4b4 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -17,6 +17,10 @@ if(NOT WITH_DISTRIBUTE)
     list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
     LIST(REMOVE_ITEM TEST_OPS test_dist_mnist)
     LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_ctr)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
 endif(NOT WITH_DISTRIBUTE)
 
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
@@ -55,6 +59,7 @@ function(py_test_modules TARGET_NAME)
     if (py_test_modules_SERIAL)
         set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
     endif()
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
   endif()
 endfunction()
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
@@ -78,9 +83,9 @@ if(WITH_DISTRIBUTE)
         set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
         py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
         set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
-        # TODO: fix this test
-        #py_test_modules(test_dist_transformer MODULES test_dist_transformer)
-        #set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
+        # FIXME(typhoonzero): add this back
+	#py_test_modules(test_dist_transformer MODULES test_dist_transformer)
+	#set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
     endif(NOT APPLE)
     py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()
@@ -88,4 +93,6 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
-py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
+if(NOT APPLE)
+    py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
+endif()
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py
index 877d21ae88..1cda2711f7 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -90,12 +90,14 @@ class TestDistMnist2x2(TestDistRunnerBase):
 
         inference_program = fluid.default_main_program().clone()
         # Optimization
-        opt = fluid.optimizer.AdamOptimizer(
-            learning_rate=0.001, beta1=0.9, beta2=0.999)
+        # TODO(typhoonzero): fix distributed adam optimizer
+        # opt = fluid.optimizer.AdamOptimizer(
+        #     learning_rate=0.001, beta1=0.9, beta2=0.999)
+        opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
 
         # Reader
         train_reader = paddle.batch(
-            paddle.dataset.mnist.train(), batch_size=batch_size)
+            paddle.dataset.mnist.test(), batch_size=batch_size)
         test_reader = paddle.batch(
             paddle.dataset.mnist.test(), batch_size=batch_size)
         opt.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
new file mode 100644
index 0000000000..d386e75fd8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py
@@ -0,0 +1,80 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+from functools import reduce
+from test_dist_base import TestDistRunnerBase, runtime_main
+from dist_mnist import cnn_model
+
+DTYPE = "float32"
+
+
+def test_merge_reader(repeat_batch_size=8):
+    orig_reader = paddle.dataset.mnist.test()
+    record_batch = []
+    b = 0
+    for d in orig_reader():
+        if b >= repeat_batch_size:
+            break
+        record_batch.append(d)
+        b += 1
+    while True:
+        for d in record_batch:
+            yield d
+
+
+class TestDistMnist2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        # Input data
+        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+        # Train program
+        predict = cnn_model(images)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        # Evaluator
+        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_acc = fluid.layers.accuracy(
+            input=predict, label=label, total=batch_size_tensor)
+
+        inference_program = fluid.default_main_program().clone()
+        # Optimization
+        opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
+
+        # Reader
+        train_reader = paddle.batch(test_merge_reader, batch_size=batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+        opt.minimize(avg_cost)
+        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
+
+
+if __name__ == "__main__":
+    runtime_main(TestDistMnist2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_lars.py b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py
new file mode 100644
index 0000000000..977e17c37f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py
@@ -0,0 +1,73 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+from functools import reduce
+from test_dist_base import TestDistRunnerBase, runtime_main
+from dist_mnist import cnn_model
+
+DTYPE = "float32"
+paddle.dataset.mnist.fetch()
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+
+class TestDistMnist2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        # Input data
+        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+        # Train program
+        predict = cnn_model(images)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        # Evaluator
+        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+        batch_acc = fluid.layers.accuracy(
+            input=predict, label=label, total=batch_size_tensor)
+
+        inference_program = fluid.default_main_program().clone()
+        # Optimization
+        opt = fluid.optimizer.LarsMomentumOptimizer(
+            learning_rate=0.001, momentum=0.9)
+
+        # Reader
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=batch_size)
+        opt.minimize(avg_cost)
+        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
+
+
+if __name__ == "__main__":
+    runtime_main(TestDistMnist2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py
new file mode 100644
index 0000000000..edc6055005
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_save_load.py
@@ -0,0 +1,174 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import sys
+import signal
+import subprocess
+import argparse
+import time
+import math
+import random
+from multiprocessing import Process
+from functools import reduce
+
+import numpy as np
+import unittest
+import six
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid import io
+
+from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP
+from dist_simnet_bow import TestDistSimnetBow2x2, DATA_URL, DATA_MD5
+
+
+class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
+    def _load_persistable_vars(self, executor, dirname, program):
+        def _is_checkpoint_var(var):
+            """
+            the checkpoint will not save or load all the variables.
+            var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
+
+            : param var(Variable)
+            """
+            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                    var.desc.type() == core.VarDesc.VarType.RAW:
+                return False
+            # @GRAD are named for gradient variables, checkpoint will not save it.
+            if "@GRAD" in var.name:
+                return False
+            # .trainer_ are named for distribute train variables, checkpoint will not save it.
+            if ".trainer_" in var.name:
+                return False
+
+            # .block is named for distribute train variables, checkpoint will not save it.
+            if ".block" in var.name:
+                return False
+
+            if "tmp_" in var.name:
+                return False
+
+            return var.persistable
+
+        io.load_vars(
+            executor,
+            dirname=dirname,
+            main_program=program,
+            predicate=_is_checkpoint_var,
+            filename=None)
+
+    def run_pserver(self, args):
+        self.get_model(batch_size=2)
+        # NOTE: pserver should not call memory optimize
+        t = self.get_transpiler(args.trainer_id,
+                                fluid.default_main_program(), args.endpoints,
+                                args.trainers, args.sync_mode)
+        pserver_prog = t.get_pserver_program(args.current_endpoint)
+        startup_prog = t.get_startup_program(args.current_endpoint,
+                                             pserver_prog)
+
+        need_load = bool(int(os.getenv("LOAD", "0")))
+        model_dir = os.getenv("MODEL_DIR", "")
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+
+        if need_load and model_dir:
+            self._load_persistable_vars(exe, model_dir, startup_prog)
+        exe.run(pserver_prog)
+
+    def run_trainer(self, args):
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
+            self.get_model(batch_size=2)
+
+        if args.mem_opt:
+            fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
+        if args.is_dist:
+            t = self.get_transpiler(args.trainer_id,
+                                    fluid.default_main_program(),
+                                    args.endpoints, args.trainers,
+                                    args.sync_mode)
+
+            trainer_prog = t.get_trainer_program()
+        else:
+            trainer_prog = fluid.default_main_program()
+
+        if args.use_cuda:
+            place = fluid.CUDAPlace(0)
+        else:
+            place = fluid.CPUPlace()
+
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+
+        build_stra = fluid.BuildStrategy()
+
+        if args.use_reduce:
+            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        else:
+            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+
+        exe = fluid.ParallelExecutor(
+            args.use_cuda,
+            loss_name=avg_cost.name,
+            exec_strategy=strategy,
+            build_strategy=build_stra)
+
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.values()
+            if var.is_data
+        ]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        reader_generator = train_reader()
+
+        def get_data():
+            origin_batch = next(reader_generator)
+            if args.is_dist and args.use_reader_alloc:
+                new_batch = []
+                for offset, item in enumerate(origin_batch):
+                    if offset % 2 == args.trainer_id:
+                        new_batch.append(item)
+                return new_batch
+            else:
+                return origin_batch
+
+        need_save = bool(int(os.getenv("SAVE", "0")))
+        model_dir = os.getenv("MODEL_DIR", "")
+
+        if need_save:
+            for _ in six.moves.xrange(RUN_STEP):
+                loss, = exe.run(fetch_list=[avg_cost.name],
+                                feed=feeder.feed(get_data()))
+            if need_save and model_dir:
+                io.save_persistables(startup_exe, model_dir, trainer_prog)
+
+        var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor())
+        print(np.ravel(var).tolist())
+
+
+if __name__ == "__main__":
+    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
+    runtime_main(TestDistSaveLoad2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
index a2cc574258..27c67edf4f 100644
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -35,7 +35,7 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid import core
-from test_dist_base import TestDistRunnerBase, runtime_main
+from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP
 import paddle.compat as cpt
 from paddle.compat import long_type
 
@@ -562,18 +562,12 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
     for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
         pass_start_time = time.time()
         for batch_id, data in enumerate(train_data()):
-            if batch_id >= 5:
+            if batch_id >= RUN_STEP:
                 break
 
             feed_list = []
             total_num_token = 0
 
-            #if TrainTaskConfig.local:
-            #    lr_rate = lr_scheduler.update_learning_rate()
-            #for place_id, data_buffer in enumerate(
-            #        split_data(
-            #            data, num_part=dev_count)):
-
             if TrainTaskConfig.local:
                 lr_rate = lr_scheduler.update_learning_rate()
 
@@ -619,12 +613,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
             init = True
 
             # Validate and save the model for inference.
-            if batch_id == 0 or batch_id == 4:
-                if TrainTaskConfig.val_file_pattern is not None:
-                    val_avg_cost, val_ppl = test()
-                    print("[%f]" % val_avg_cost)
-                else:
-                    assert (False)
+            if TrainTaskConfig.val_file_pattern is not None:
+                val_avg_cost, val_ppl = test()
+                print("[%f]" % val_avg_cost)
+            else:
+                assert (False)
 
 
 #import transformer_reader as reader
@@ -1166,6 +1159,7 @@ def prepare_encoder(src_word,
             name=pos_enc_param_name,
             trainable=False,
             initializer=fluid.initializer.ConstantInitializer(0.001)))
+    src_pos_enc.stop_gradient = True
     enc_input = src_word_emb + src_pos_enc
     return layers.dropout(
         enc_input,
@@ -1701,7 +1695,7 @@ class DistTransformer2x2(TestDistRunnerBase):
 
     def run_trainer(self, args):
         TrainTaskConfig.use_gpu = args.use_cuda
-        sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
+        sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model(
             args.is_dist, not args.sync_mode)
 
         if args.is_dist:
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index ee291fe746..a3fe5e0a05 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -40,7 +40,8 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   use_reduce=False,
                                   fuse_elewise_add_act_ops=False,
                                   optimizer=fluid.optimizer.Adam,
-                                  use_fast_executor=False):
+                                  use_fast_executor=False,
+                                  enable_sequential_execution=False):
         def run_executor(exe, feed, fetch_list, program=None):
             if isinstance(exe, fluid.ParallelExecutor):
                 res = exe.run(fetch_list=fetch_list, feed=feed)
@@ -80,6 +81,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
                 if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
             build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
+            build_strategy.enable_sequential_execution = enable_sequential_execution
 
             if use_parallel_executor:
                 exe = fluid.ParallelExecutor(
diff --git a/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py b/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
new file mode 100644
index 0000000000..3f2a337930
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
@@ -0,0 +1,134 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import math
+import paddle.fluid.core as core
+from op_test import OpTest
+
+
+class TestAddPositionEncodingTensorOp(OpTest):
+    """
+    This class is to test the AddPositionEncodingOp
+    """
+
+    def setUp(self):
+        """
+        the prepared section for add position encoding op
+        """
+        self.op_type = "add_position_encoding"
+        self.dtype = np.float32
+        self.init_input_output()
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x), }
+        self.outputs = {'Out': self.out}
+        self.attrs = {'alpha': self.alpha, 'beta': self.beta}
+
+    def test_check_output(self):
+        """
+        check the correctness of output
+        """
+        self.check_output()
+
+    def test_check_grad(self):
+        """
+        check the correctness of grad
+        """
+        self.check_grad(['X'], 'Out', max_relative_error=0.005)
+
+    def init_input_output(self):
+        """
+        init the input and output for test cases
+        """
+        self.alpha = 0.6
+        self.beta = 0.5
+        self.x = np.random.uniform(0.1, 1, [2, 4, 4]).astype(self.dtype)
+        self.out = np.copy(self.x)
+
+        batch_size = self.x.shape[0]
+        max_length = self.x.shape[1]
+        enc_size = self.x.shape[2]
+
+        half_shape = int(enc_size / 2)
+        for i in range(batch_size):
+            for j in range(max_length):
+                for k in range(half_shape):
+                    val = j / pow(10000.0, k / (
+                        half_shape - 1)) if half_shape > 1 else j / 10000.0
+                    self.out[i, j, k] = \
+                        self.x[i, j, k] * self.alpha + math.sin(val) * self.beta
+                    self.out[i, j, half_shape + k] = \
+                        self.x[i, j, half_shape + k] * self.alpha + math.cos(val) * self.beta
+
+
+class TestAddPositionEncodingLoDTensorOp(OpTest):
+    """
+    This class is to test the AddPositionEncodingLoDTensorOp
+    """
+
+    def setUp(self):
+        """
+        the prepared section for add position encoding LoDTensor op
+        """
+        self.op_type = "add_position_encoding"
+        self.dtype = np.float32
+        self.init_input_output()
+
+        self.inputs = {'X': (self.x, self.lod), }
+        self.outputs = {'Out': (self.out, self.lod)}
+        self.attrs = {'alpha': self.alpha, 'beta': self.beta}
+
+    def test_check_output(self):
+        """
+        check the correctness of output
+        """
+        self.check_output()
+
+    def test_check_grad(self):
+        """
+        check the correctness of grad
+        """
+        self.check_grad(['X'], 'Out', max_relative_error=0.005)
+
+    def init_input_output(self):
+        """
+        init the input and output for test cases
+        """
+        self.alpha = 0.6
+        self.beta = 0.5
+        self.x = np.random.uniform(0.1, 1, [10, 4]).astype(self.dtype)
+        self.lod = [[3, 7]]
+        self.out = np.copy(self.x)
+
+        batch_size = len(self.lod[0])
+        enc_size = self.x.shape[1]
+
+        start = 0
+        half_shape = int(enc_size / 2)
+        for i in range(batch_size):
+            max_length = self.lod[0][i]
+            for j in range(max_length):
+                for k in range(half_shape):
+                    val = j / pow(10000.0, k / (
+                        half_shape - 1)) if half_shape > 1 else j / 10000.0
+                    pos = start + j
+                    self.out[pos, k] = \
+                        self.x[pos, k] * self.alpha + math.sin(val) * self.beta
+                    self.out[pos, half_shape + k] = \
+                        self.x[pos, half_shape + k] * self.alpha + math.cos(val) * self.beta
+            start += max_length
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
new file mode 100644
index 0000000000..576d00940c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py
@@ -0,0 +1,79 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def AffineGrid(theta, size):
+    n = size[0]
+    w = size[3]
+    h = size[2]
+    h_idx = np.repeat(
+        np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
+    w_idx = np.repeat(
+        np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
+    grid = np.concatenate(
+        [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
+    grid = np.repeat(grid[np.newaxis, :], size[0], axis=0)  # n * h * w *3
+
+    ret = np.zeros([n, h * w, 2])
+    theta = theta.transpose([0, 2, 1])
+    for i in range(len(theta)):
+        ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i])
+
+#    print ret.reshape([h * w, 2]).astype("float32")    
+    return ret.reshape([n, h, w, 2]).astype("float32")
+
+
+class TestAffineGridOp(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = "affine_grid"
+        theta = np.random.randint(1, 3, self.theta_shape).astype("float32")
+        theta = np.ones(self.theta_shape).astype("float32")
+        self.inputs = {'Theta': theta}
+        self.attrs = {"use_cudnn": True}
+        if self.dynamic_shape:
+            self.inputs['OutputShape'] = self.output_shape
+        else:
+            self.attrs['output_shape'] = self.output_shape
+        self.outputs = {'Output': AffineGrid(theta, self.output_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(
+            ['Theta'],
+            'Output',
+            no_grad_set=['OutputShape'],
+            max_relative_error=0.006)
+
+    def initTestCase(self):
+        self.theta_shape = (3, 2, 3)
+        self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = False
+
+
+class TestAffineGridOpCase1(TestAffineGridOp):
+    def initTestCase(self):
+        self.theta_shape = (3, 2, 3)
+        self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
+        self.dynamic_shape = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 04924bec05..07814bc257 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -22,14 +22,17 @@ import signal
 import subprocess
 import six
 import argparse
+import pickle
+import numpy as np
 
 import paddle.fluid as fluid
 
 RUN_STEP = 10
+DEFAULT_BATCH_SIZE = 2
 
 
 class TestDistRunnerBase(object):
-    def get_model(self, batch_size=2):
+    def get_model(self, batch_size=DEFAULT_BATCH_SIZE):
         raise NotImplementedError(
             "get_model should be implemented by child classes.")
 
@@ -48,8 +51,7 @@ class TestDistRunnerBase(object):
         return t
 
     def run_pserver(self, args):
-
-        self.get_model(batch_size=2)
+        self.get_model(batch_size=args.batch_size)
         # NOTE: pserver should not call memory optimize
         t = self.get_transpiler(args.trainer_id,
                                 fluid.default_main_program(), args.endpoints,
@@ -65,7 +67,7 @@ class TestDistRunnerBase(object):
 
     def run_trainer(self, args):
         test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
-            self.get_model(batch_size=2)
+            self.get_model(batch_size=args.batch_size)
 
         if args.mem_opt:
             fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
@@ -92,6 +94,11 @@ class TestDistRunnerBase(object):
         strategy.allow_op_delay = False
 
         build_stra = fluid.BuildStrategy()
+        if args.batch_merge_repeat > 1:
+            pass_builder = build_stra._create_passes_from_strategy()
+            mypass = pass_builder.insert_pass(
+                len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
+            mypass.set_int("num_repeats", args.batch_merge_repeat)
 
         if args.use_reduce:
             build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
@@ -123,10 +130,15 @@ class TestDistRunnerBase(object):
             else:
                 return origin_batch
 
+        out_losses = []
         for _ in six.moves.xrange(RUN_STEP):
             loss, = exe.run(fetch_list=[avg_cost.name],
                             feed=feeder.feed(get_data()))
-            print(loss)
+            out_losses.append(loss[0])
+        if six.PY2:
+            print(pickle.dumps(out_losses))
+        else:
+            sys.stdout.buffer.write(pickle.dumps(out_losses))
 
 
 def runtime_main(test_class):
@@ -144,7 +156,10 @@ def runtime_main(test_class):
     parser.add_argument('--use_cuda', action='store_true')
     parser.add_argument('--use_reduce', action='store_true')
     parser.add_argument(
-        '--use_reader_alloc', action='store_true', required=False, default=True)
+        '--use_reader_alloc', action='store_true', required=False)
+    parser.add_argument('--batch_size', required=False, type=int, default=2)
+    parser.add_argument(
+        '--batch_merge_repeat', required=False, type=int, default=1)
 
     args = parser.parse_args()
 
@@ -180,7 +195,7 @@ class TestDistBase(unittest.TestCase):
         self._pservers = 2
         self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
             self._find_free_port(), self._find_free_port())
-        self._python_interp = "python"
+        self._python_interp = sys.executable
         self._sync_mode = True
         self._enforce_place = None
         self._mem_opt = False
@@ -229,24 +244,18 @@ class TestDistBase(unittest.TestCase):
 
         return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe
 
-    def _wait_ps_ready(self, pid):
-        retry_times = 50
-        while True:
-            assert retry_times >= 0, "wait ps ready failed"
-            time.sleep(3)
-            try:
-                # the listen_and_serv_op would touch a file which contains the listen port
-                # on the /tmp directory until it was ready to process all the RPC call.
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error as e:
-                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
-                                 (e, retry_times))
-                retry_times -= 1
-
-    def _run_local(self, model, envs, check_error_log):
+    def _run_local(self,
+                   model,
+                   envs,
+                   check_error_log=False,
+                   batch_size=DEFAULT_BATCH_SIZE,
+                   batch_merge_repeat=1):
 
         cmd = "%s %s --role trainer" % (self._python_interp, model)
+        if batch_size != DEFAULT_BATCH_SIZE:
+            cmd += " --batch_size %d" % batch_size
+        if batch_merge_repeat > 1:
+            cmd += " --batch_merge_repeat %d" % batch_merge_repeat
 
         if self.__use_cuda:
             cmd += " --use_cuda"
@@ -271,23 +280,20 @@ class TestDistBase(unittest.TestCase):
                 env=envs)
 
         local_out, local_err = local_proc.communicate()
-        local_ret = cpt.to_text(local_out)
 
         if check_error_log:
             err_log.close()
 
-        sys.stderr.write('local_stdout: %s\n' % local_ret)
+        sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
         sys.stderr.write('local_stderr: %s\n' % local_err)
 
-        local_losses = local_ret.split("\n")
-        return local_losses
+        return pickle.loads(local_out)
 
     def _run_cluster(self, model, envs, check_error_log):
         # Run dist train to compare with local results
         ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
                                                           check_error_log, envs)
-        self._wait_ps_ready(ps0.pid)
-        self._wait_ps_ready(ps1.pid)
+
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
 
         tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
@@ -322,8 +328,8 @@ class TestDistBase(unittest.TestCase):
         env0.update(envs)
         env1.update(envs)
 
-        print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
-        print("tr1_cmd:{}, env1: {}".format(tr1_cmd, env1))
+        print("tr0_cmd:{}".format(tr0_cmd))
+        print("tr1_cmd:{}".format(tr1_cmd))
         tr0_pipe = open("/tmp/tr0_err.log", "wb")
         tr1_pipe = open("/tmp/tr1_err.log", "wb")
 
@@ -339,9 +345,7 @@ class TestDistBase(unittest.TestCase):
             env=env1)
 
         tr0_out, tr0_err = tr0_proc.communicate()
-        tr0_loss_text = cpt.to_text(tr0_out)
         tr1_out, tr1_err = tr1_proc.communicate()
-        tr1_loss_text = cpt.to_text(tr1_out)
 
         # close trainer file
         tr0_pipe.close()
@@ -356,15 +360,13 @@ class TestDistBase(unittest.TestCase):
         ps1.terminate()
 
         # print log
-        sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text)
-        sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err)
-        sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text)
+        sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
+        sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
+        sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
         sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
 
-        tr0_losses = tr0_loss_text.split("\n")
-        tr1_losses = tr1_loss_text.split("\n")
-
-        return tr0_losses, tr1_losses
+        # return tr0_losses, tr1_losses
+        return pickle.loads(tr0_out), pickle.loads(tr1_out)
 
     def check_with_place(self,
                          model_file,
@@ -394,9 +396,9 @@ class TestDistBase(unittest.TestCase):
                                                    check_error_log)
 
         for step_id in range(RUN_STEP):
-            local_loss = eval(local_losses[step_id])[0]
-            tr0_loss = eval(tr0_losses[step_id])[0]
-            tr1_loss = eval(tr1_losses[step_id])[0]
-            dist_loss = (tr0_loss + tr1_loss) / 2
-            print(str(local_loss) + ":" + str(dist_loss))
-            self.assertAlmostEqual(local_loss, dist_loss, delta=delta)
+            local_loss = local_losses[step_id]
+            tr0_loss = tr0_losses[step_id]
+            tr1_loss = tr1_losses[step_id]
+            dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2
+            print("=======", local_loss, ":", dist_loss[0], "=======")
+            self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
index 3575fd07fc..b2d979729b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py
@@ -18,14 +18,14 @@ import unittest
 from test_dist_base import TestDistBase
 
 
+# FIXME(tangwei): sum op can not handle when inputs is empty.
 class TestDistCTR2x2(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
         self._enforce_place = "CPU"
 
-
-def test_dist_ctr(self):
-    self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
+    def test_dist_ctr(self):
+        self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
index f65dd7e2a2..922dd838f8 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -26,6 +26,15 @@ class TestDistMnist2x2(TestDistBase):
         self.check_with_place("dist_mnist.py", delta=1e-5)
 
 
+class TestDistMnist2x2Lars(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+
+    def test_se_resnext(self):
+        self.check_with_place("dist_mnist_lars.py", delta=1e-5)
+
+
 class TestDistMnist2x2WithMemopt(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
new file mode 100644
index 0000000000..22d4b79290
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py
@@ -0,0 +1,67 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+import os
+
+
+class TestDistMnist2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+
+    def test_dist_train(self):
+        self.check_with_place("dist_mnist_batch_merge.py", delta=1e-5)
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        # TODO(typhoonzero): should auto adapt GPU count on the machine.
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_cudnn_deterministic": "1",
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        no_merge_losses = self._run_local(
+            model_file,
+            required_envs,
+            check_error_log=check_error_log,
+            batch_size=4)
+
+        batch_merge_losses = self._run_local(
+            model_file,
+            required_envs,
+            check_error_log=check_error_log,
+            batch_size=2,
+            batch_merge_repeat=2)
+        # Ensure both result have values.
+        self.assertGreater(len(no_merge_losses), 1)
+        self.assertEqual(len(no_merge_losses), len(batch_merge_losses))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
new file mode 100644
index 0000000000..03066fee48
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
@@ -0,0 +1,90 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import os
+import shutil
+import unittest
+import tempfile
+
+import numpy as np
+
+from test_dist_base import TestDistBase, RUN_STEP
+
+
+class TestDistSaveLoadDense2x2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._enforce_place = "CPU"
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "http_proxy": ""
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        model_dir = tempfile.mkdtemp()
+
+        local_env = {}
+        local_env["SAVE"] = "1"
+        local_env["MODEL_DIR"] = model_dir
+        local_env.update(required_envs)
+
+        cluster_env = {}
+        cluster_env["LOAD"] = "1"
+        cluster_env["MODEL_DIR"] = model_dir
+        cluster_env.update(required_envs)
+
+        local_var = self._run_local(model_file, local_env, check_error_log)
+        tr0_var, tr1_var = self._run_cluster(model_file, cluster_env,
+                                             check_error_log)
+
+        shutil.rmtree(model_dir)
+
+        local_np = np.array(eval(local_var[0]))
+        train0_np = np.array(eval(tr0_var[0]))
+        train1_np = np.array(eval(tr1_var[0]))
+        self.assertAlmostEqual(local_np.all(), train0_np.all(), delta=delta)
+        self.assertAlmostEqual(local_np.all(), train1_np.all(), delta=delta)
+        self.assertAlmostEqual(train0_np.all(), train1_np.all(), delta=delta)
+
+    @unittest.skip(reason="CI fail")
+    def test_dist(self):
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '0',
+            'IS_SELF_CONTAINED_LR': '1'
+        }
+        self.check_with_place(
+            "dist_save_load.py",
+            delta=0,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
index c0989ca709..c2a4e5ca0c 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -23,16 +23,17 @@ class TestDistSeResneXt2x2(TestDistBase):
         self._use_reader_alloc = False
 
     def test_dist_train(self):
-        self.check_with_place("dist_se_resnext.py", delta=100)
+        self.check_with_place("dist_se_resnext.py", delta=1e-7)
 
 
 class TestDistseResnXt2x2WithMemopt(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
         self._mem_opt = True
+        self._use_reader_alloc = False
 
     def test_dist_train(self):
-        self.check_with_place("dist_se_resnext.py", delta=100)
+        self.check_with_place("dist_se_resnext.py", delta=1e-7)
 
 
 class TestDistSeResneXt2x2Async(TestDistBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
index a0b6879f99..102a4dab05 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
@@ -42,7 +42,7 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
         self._sync_mode = False
         self._enforce_place = "CPU"
 
-    def test_simnet_bow(self):
+    def no_test_simnet_bow(self):
         need_envs = {
             "IS_DISTRIBUTED": '0',
             "IS_SPARSE": '0',
@@ -92,7 +92,6 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
 
 
 # FIXME(tangwei): Learningrate variable is not created on pserver.
-"""
 class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
@@ -145,7 +144,7 @@ class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
             delta=1e-5,
             check_error_log=False,
             need_envs=need_envs)
-"""
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
index 47e8dfaf03..25dcccc28d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -61,7 +61,8 @@ class TestDistTransformer2x2Sync(TestDistBase):
 
     def test_dist_train(self):
         download_files()
-        self.check_with_place("dist_transformer.py", delta=1e-5)
+        self.check_with_place(
+            "dist_transformer.py", delta=1e-5, check_error_log=False)
 
 
 class TestDistTransformer2x2Async(TestDistBase):
@@ -70,7 +71,8 @@ class TestDistTransformer2x2Async(TestDistBase):
 
     def test_dist_train(self):
         download_files()
-        self.check_with_place("dist_transformer.py", delta=1.0)
+        self.check_with_place(
+            "dist_transformer.py", delta=1.0, check_error_log=False)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 54a1c68a37..986fdd9ff2 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -283,6 +283,25 @@ class TestDecayedAdagrad(TranspilerTest):
         trainer, _ = self.get_trainer()
 
 
+class TestFtrl(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        opt = fluid.optimizer.Ftrl(learning_rate=0.1)
+        opt.minimize(avg_cost)
+
+    def transpiler_test_impl(self):
+        pserver, startup = self.get_pserver(self.pserver1_ep)
+        trainer, _ = self.get_trainer()
+
+
 class TestLRDecayConditional(TranspilerTest):
     def net_conf(self):
         x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
@@ -405,18 +424,43 @@ class TestL2DecayWithPiecewise(TranspilerTest):
             ["sum", "scale", "scale", "elementwise_add", "momentum"])
 
 
+class TestEmptyPserverOptimizeBlocks(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        # only one parameter
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=False)
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=1.0)
+        sgd_optimizer.minimize(avg_cost)
+
+    def transpiler_test_impl(self):
+        config = fluid.DistributeTranspilerConfig()
+        config.slice_var_up = False
+
+        pserver, startup = self.get_pserver(ep=self.pserver2_ep, config=config)
+
+        self.assertEqual(len(pserver.blocks), 2)
+        self.assertEqual(len(pserver.blocks[1].ops), 0)
+
+
 class TestDistLookupTableBase(TranspilerTest):
     def network_with_table(self, is_sparse, is_distributed):
         self.table_size = 1000
         self.emb_size = 64
         self.lookup_table_name = 'shared_w'
 
-        def emb_pool(ids):
+        def emb_pool(ids, table_name, is_distributed):
             emb = fluid.layers.embedding(
                 input=ids,
                 size=[self.table_size, self.emb_size],
                 dtype='float32',
-                param_attr=self.lookup_table_name,  # share parameter
+                param_attr=table_name,
                 is_sparse=is_sparse,
                 is_distributed=is_distributed)
             pool = fluid.layers.sequence_pool(input=emb, pool_type='average')
@@ -426,9 +470,13 @@ class TestDistLookupTableBase(TranspilerTest):
             name='title_ids', shape=[1], dtype='int64', lod_level=1)
         brand_ids = fluid.layers.data(
             name='brand_ids', shape=[1], dtype='int64', lod_level=1)
-        title_emb = emb_pool(title_ids)
-        brand_emb = emb_pool(brand_ids)
-        fc0 = fluid.layers.concat(input=[title_emb, brand_emb], axis=1)
+        profile_ids = fluid.layers.data(
+            name='brand_ids', shape=[1], dtype='int64', lod_level=1)
+        title_emb = emb_pool(title_ids, self.lookup_table_name, is_distributed)
+        brand_emb = emb_pool(brand_ids, self.lookup_table_name, is_distributed)
+        profile_emb = emb_pool(profile_ids, "profile_emb", False)
+        fc0 = fluid.layers.concat(
+            input=[title_emb, brand_emb, profile_emb], axis=1)
         predict = fluid.layers.fc(input=fc0,
                                   size=2,
                                   act=None,
@@ -449,7 +497,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
     def transpiler_test_impl(self):
         pserver1, startup1 = self.get_pserver(self.pserver1_ep)
 
-        self.assertEqual(len(pserver1.blocks), 3)
+        self.assertEqual(len(pserver1.blocks), 4)
         # 0 listen_and_serv
         # 1 optimize for fc_w or fc_b adam
         self.assertEqual([op.type for op in pserver1.blocks[1].ops],
@@ -459,16 +507,23 @@ class TestLocalLookupTable(TestDistLookupTableBase):
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
                          ["sum", "scale", "adam", "scale", "scale"])
 
+        # 3 optimize for table 2 adam
+        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+
         trainer, _ = self.get_trainer()
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
-            'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
-            'fill_constant', 'mean_grad', 'cross_entropy_grad',
-            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
-            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sum', 'split_selected_rows', 'send',
-            'send_barrier', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
+            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
+            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'split_selected_rows', 'send', 'sequence_pool_grad',
+            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
+            'recv', 'recv', 'recv', 'fetch_barrier', 'concat', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
 
@@ -485,31 +540,43 @@ class TestDistLookupTable(TestDistLookupTableBase):
         # 1 optimize for fc_w or fc_b adam
         self.assertEqual([op.type for op in pserver1.blocks[1].ops],
                          ["sum", "scale", "adam", "scale", "scale"])
-        # 2 optimize for table sgd
+        # 4 prefetch -> lookup_sparse_table for data0
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+        # 2 optimize for table sgd
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
                          ["sum", "sgd"])
         # 3 prefetch -> lookup_sparse_table for data0
-        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
-                         ["lookup_sparse_table"])
-        # 4 prefetch -> lookup_sparse_table for data1
         self.assertEqual([op.type for op in pserver1.blocks[4].ops],
                          ["lookup_sparse_table"])
         # 5 save table
         self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
 
-        trainer, _ = self.get_trainer()
+        trainer, trainer_startup = self.get_trainer()
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
-            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
-            'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul',
+            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
+            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
             'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
             'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
             'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
-            'sum', 'split_ids', 'send', 'send_barrier', 'recv', 'recv',
-            'fetch_barrier'
+            'lookup_table_grad', 'split_selected_rows', 'send',
+            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier',
+            'recv', 'recv', 'recv', 'fetch_barrier', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+        startup_ops = [
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
+            'fill_constant', 'fill_constant', 'uniform_random',
+            'uniform_random', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat',
+            'fake_init'
+        ]
+        self.assertEqual([op.type for op in trainer_startup.blocks[0].ops],
+                         startup_ops)
 
 
 class TestAsyncLocalLookupTable(TestDistLookupTableBase):
@@ -520,7 +587,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
         config = fluid.DistributeTranspilerConfig()
         pserver1, startup1 = self.get_pserver(self.pserver1_ep, config, False)
 
-        self.assertEqual(len(pserver1.blocks), 3)
+        self.assertEqual(len(pserver1.blocks), 4)
         # 0 listen_and_serv
         # 1 optimize for fc_w or fc_b adam
         self.assertEqual([op.type for op in pserver1.blocks[1].ops],
@@ -529,17 +596,23 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
         # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
         self.assertEqual([op.type for op in pserver1.blocks[2].ops],
                          ["adam", "scale", "scale"])
+        # 3 optimize for table adam
+        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["adam", "scale", "scale"])
 
         trainer, _ = self.get_trainer(config)
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
             'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
-            'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean',
-            'fill_constant', 'mean_grad', 'cross_entropy_grad',
-            'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad',
-            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'recv',
-            'recv', 'recv', 'concat'
+            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
+            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'split_selected_rows', 'send', 'sequence_pool_grad',
+            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'sum', 'split_selected_rows', 'send', 'recv', 'recv', 'recv',
+            'recv', 'concat', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
 
@@ -558,12 +631,12 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
         # 1 optimize for fc_w or fc_b adam
         self.assertEqual([op.type for op in pserver1.blocks[1].ops],
                          ["adam", "scale", "scale"])
-        # 2 optimize for table sgd
-        self.assertEqual([op.type for op in pserver1.blocks[2].ops], ["sgd"])
-        # 3 prefetch -> lookup_sparse_table for data0
-        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
-                         ["lookup_sparse_table"])
-        # 4 prefetch -> lookup_sparse_table for data1
+        # 2 optimize for table adam
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["adam", "scale", "scale"])
+        # 3 optimize for table sgd
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops], ["sgd"])
+        # 4 prefetch -> lookup_sparse_table for data0
         self.assertEqual([op.type for op in pserver1.blocks[4].ops],
                          ["lookup_sparse_table"])
         # 5 save table
@@ -572,13 +645,15 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
         trainer, _ = self.get_trainer(config)
         self.assertEqual(len(trainer.blocks), 1)
         ops = [
-            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
-            'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul',
+            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool',
+            'sequence_pool', 'lookup_table', 'sequence_pool', 'concat', 'mul',
             'elementwise_add', 'cross_entropy', 'mean', 'fill_constant',
             'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send',
             'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad',
-            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
-            'sum', 'split_ids', 'send', 'recv', 'recv'
+            'lookup_table_grad', 'split_selected_rows', 'send',
+            'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad',
+            'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv',
+            'recv', 'concat'
         ]
         self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
 
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index 0296bc2af4..be3c5f3b95 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -85,6 +85,69 @@ class TestDropoutOp5(OpTest):
         self.check_output()
 
 
+class TestDropoutOp6(TestDropoutOp):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
+        self.attrs = {
+            'dropout_prob': 1.0,
+            'fix_seed': True,
+            'is_test': False,
+            'dropout_implementation': 'upscale_in_train'
+        }
+        self.outputs = {
+            'Out': np.zeros((32, 64)).astype('float32'),
+            'Mask': np.zeros((32, 64)).astype('float32')
+        }
+
+
+class TestDropoutOp7(TestDropoutOp):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
+        self.attrs = {
+            'dropout_prob': 0.0,
+            'fix_seed': True,
+            'is_test': False,
+            'dropout_implementation': 'upscale_in_train'
+        }
+        self.outputs = {
+            'Out': self.inputs['X'],
+            'Mask': np.ones((32, 64, 2)).astype('float32')
+        }
+
+
+class TestDropoutOp8(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
+        self.attrs = {
+            'dropout_prob': 0.35,
+            'fix_seed': True,
+            'is_test': True,
+            'dropout_implementation': 'upscale_in_train'
+        }
+        self.outputs = {'Out': self.inputs['X']}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestDropoutOp9(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
+        self.attrs = {
+            'dropout_prob': 0.75,
+            'is_test': True,
+            'dropout_implementation': 'upscale_in_train'
+        }
+        self.outputs = {'Out': self.inputs['X']}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestFP16DropoutOp(OpTest):
     def setUp(self):
         self.op_type = "dropout"
diff --git a/python/paddle/fluid/tests/unittests/test_fake_init_op.py b/python/paddle/fluid/tests/unittests/test_fake_init_op.py
new file mode 100644
index 0000000000..a62b7aed66
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fake_init_op.py
@@ -0,0 +1,52 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+class TestFakeInitOpSelectedRows(unittest.TestCase):
+    def check_with_place(self, place, is_selected_rows):
+        scope = core.Scope()
+
+        out_var_name = 'Out'
+        if is_selected_rows:
+            out_tensor = scope.var(out_var_name).get_selected_rows().get_tensor(
+            )
+        else:
+            out_tensor = scope.var(out_var_name).get_tensor()
+
+        var_shape = [4, 784]
+
+        # create and run fake_init_op
+        fake_init_op = Operator("fake_init", Out=out_var_name, shape=var_shape)
+        fake_init_op.run(scope, place)
+
+        self.assertEqual(var_shape, out_tensor._get_dims())
+
+    def test_fake_init_selected_rows(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            for is_selected_rows in [True, False]:
+                self.check_with_place(place, is_selected_rows)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
index 36ebc8fb6e..377454e780 100644
--- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
@@ -125,6 +125,12 @@ class TestFusionGRUOpMD2(TestFusionGRUOp):
         self.D = 8
 
 
+class TestFusionGRUOpMD3(TestFusionGRUOp):
+    def set_confs(self):
+        self.M = 17
+        self.D = 15
+
+
 class TestFusionGRUOpBS1(TestFusionGRUOp):
     def set_confs(self):
         self.lod = [[3]]
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
new file mode 100644
index 0000000000..c2529e0d70
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
@@ -0,0 +1,123 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def AffineGrid(theta, size):
+    n = size[0]
+    h = size[2]
+    w = size[3]
+    h_idx = np.repeat(
+        np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[:, :, np.newaxis]
+    w_idx = np.repeat(
+        np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[:, :, np.newaxis]
+    grid = np.concatenate(
+        [w_idx, h_idx, np.ones([h, w, 1])], axis=2)  # h * w * 3
+    grid = np.repeat(grid[np.newaxis, :], size[0], axis=0)  # n * h * w *3
+
+    ret = np.zeros([n, h * w, 2])
+    theta = theta.transpose([0, 2, 1])
+    for i in range(len(theta)):
+        ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i])
+
+    return ret.reshape([n, h, w, 2]).astype("float32")
+
+
+def getGridPointValue(data, x, y):
+    data_shape = data.shape
+    N = data_shape[0]
+    H = data_shape[2]
+    W = data_shape[3]
+
+    out = np.zeros(data_shape, dtype='float')
+    for i in range(N):
+        for j in range(H):
+            for k in range(W):
+                if y[i, j, k] < 0 or y[i, j, k] > H - 1 or x[i, j, k] < 0 or x[
+                        i, j, k] > W - 1:
+                    out[i, :, j, k] = 0
+                else:
+                    out[i, :, j, k] = data[i, :, y[i, j, k], x[i, j, k]]
+
+    return out
+
+
+def GridSampler(data, grid):
+    dims = data.shape
+    N = dims[0]
+    C = dims[1]
+    H = dims[2]
+    W = dims[3]
+
+    x = grid[:, :, :, 0]
+    y = grid[:, :, :, 1]
+    y_max = H - 1
+    x_max = W - 1
+
+    x = 0.5 * ((x.astype('float32') + 1.0) * x_max)
+    y = 0.5 * ((y.astype('float32') + 1.0) * y_max)
+
+    x0 = np.floor(x).astype('int32')
+    x1 = x0 + 1
+    y0 = np.floor(y).astype('int32')
+    y1 = y0 + 1
+
+    wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, H, W)), (1, C, 1, 1))
+    wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, H, W)), (1, C, 1, 1))
+    wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, H, W)), (1, C, 1, 1))
+    wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, H, W)), (1, C, 1, 1))
+
+    va = getGridPointValue(data, x0, y0)
+    vb = getGridPointValue(data, x0, y1)
+    vc = getGridPointValue(data, x1, y0)
+    vd = getGridPointValue(data, x1, y1)
+
+    out = (wa * va + wb * vb + wc * vc + wd * vd).astype('float32')
+    return out
+
+
+class TestGridSamplerOp(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = 'grid_sampler'
+        x = np.random.randint(0, 255, self.x_shape).astype('float32')
+
+        theta = np.zeros(self.theta_shape).astype('float32')
+        for i in range(self.theta_shape[0]):
+            for j in range(2):
+                for k in range(3):
+                    theta[i, j, k] = np.random.rand(1)[0]
+        grid = AffineGrid(theta, self.x_shape)
+
+        self.inputs = {'X': x, 'Grid': grid}
+        self.attrs = {'use_cudnn': True}
+        self.outputs = {'Output': GridSampler(x, grid)}
+
+    def test_check_output(self):
+        self.check_output(atol=1e-3)
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.61)
+
+    def initTestCase(self):
+        self.x_shape = (2, 5, 7, 3)
+        self.grid_shape = (2, 7, 3, 2)
+        self.theta_shape = (2, 2, 3)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hash_op.py b/python/paddle/fluid/tests/unittests/test_hash_op.py
new file mode 100644
index 0000000000..1130ea39c4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_hash_op.py
@@ -0,0 +1,57 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestScaleOp(OpTest):
+    def setUp(self):
+        self.op_type = "hash"
+        self.init_test_case()
+        self.inputs = {'X': (self.in_seq, self.lod)}
+        self.attrs = {'num_hash': 4, 'mod_by': 10000}
+        self.outputs = {'Out': (self.out_seq, self.lod)}
+
+    def init_test_case(self):
+        np.random.seed = 1
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        self.lod = [[9, 4, 11, 6]]
+        #  self.out_seq = np.ones([30, 4, 1], dtype=np.int32)
+        self.out_seq = [
+            [[9662], [9217], [1129], [8487]], [[9662], [9217], [1129], [8487]],
+            [[8310], [1327], [1654], [4567]], [[6897], [3218], [2013], [1241]],
+            [[9407], [6715], [6949], [8094]], [[8473], [694], [5142], [2479]],
+            [[8310], [1327], [1654], [4567]], [[6897], [3218], [2013], [1241]],
+            [[4372], [9456], [8204], [6695]], [[6897], [3218], [2013], [1241]],
+            [[8473], [694], [5142], [2479]], [[4372], [9456], [8204], [6695]],
+            [[4372], [9456], [8204], [6695]], [[8473], [694], [5142], [2479]],
+            [[9407], [6715], [6949], [8094]], [[9369], [4525], [8935], [9210]],
+            [[4372], [9456], [8204], [6695]], [[4372], [9456], [8204], [6695]],
+            [[9369], [4525], [8935], [9210]], [[6897], [3218], [2013], [1241]],
+            [[9038], [7951], [5953], [8657]], [[9407], [6715], [6949], [8094]],
+            [[9662], [9217], [1129], [8487]], [[9369], [4525], [8935], [9210]],
+            [[9038], [7951], [5953], [8657]], [[9662], [9217], [1129], [8487]],
+            [[9369], [4525], [8935], [9210]], [[1719], [5986], [9919], [3421]],
+            [[4372], [9456], [8204], [6695]], [[9038], [7951], [5953], [8657]]
+        ]
+        self.out_seq = np.array(self.out_seq)
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 50de468dba..c4ecc2c2c2 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -865,6 +865,31 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_grid_sampler(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 5, 7], dtype='float32')
+            grid = layers.data(name='grid', shape=[5, 7, 2], dtype='float32')
+            out = layers.grid_sampler(x, grid)
+            self.assertIsNotNone(out)
+        print(str(program))
+
+    def test_affine_grid(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name='data', shape=[2, 3, 3], dtype="float32")
+            out, ids = layers.argsort(input=data, axis=1)
+
+            theta = layers.data(name="theta", shape=[2, 3], dtype="float32")
+            out_shape = layers.data(
+                name="out_shape", shape=[-1], dtype="float32")
+            data_0 = layers.affine_grid(theta, out_shape)
+            data_1 = layers.affine_grid(theta, [5, 3, 28, 28])
+
+            self.assertIsNotNone(data_0)
+            self.assertIsNotNone(data_1)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
index 48b52a5412..a0358f8b40 100644
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -55,6 +55,46 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
     exe.run(pserver_prog)
 
 
+def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers,
+                                 trainer_id):
+    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None, bias_attr=False)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+    # loss function
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    # optimizer
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    ps1 = ip + ":" + str(int(port) + 1)
+    ps2 = ip + ":" + port
+    pserver_endpoints = ps1 + "," + ps2
+
+    config = fluid.DistributeTranspilerConfig()
+    config.slice_var_up = False
+    t = fluid.DistributeTranspiler(config=config)
+    t.transpile(
+        trainer_id,
+        pservers=pserver_endpoints,
+        trainers=trainers,
+        sync_mode=sync_mode)
+    pserver_prog = t.get_pserver_program(ps2)
+
+    # pserver2 have no parameter
+    assert (len(pserver_prog.blocks) == 2)
+    assert (len(pserver_prog.blocks[1].ops) == 0)
+
+    pserver_startup = t.get_startup_program(ps2, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+
+
 class TestListenAndServOp(OpTest):
     def setUp(self):
         self.ps_timeout = 5
@@ -63,9 +103,9 @@ class TestListenAndServOp(OpTest):
         self.trainers = 1
         self.trainer_id = 0
 
-    def _start_pserver(self, use_cuda, sync_mode):
+    def _start_pserver(self, use_cuda, sync_mode, pserver_func):
         p = Process(
-            target=run_pserver,
+            target=pserver_func,
             args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
                   self.trainer_id))
         p.daemon = True
@@ -92,7 +132,24 @@ class TestListenAndServOp(OpTest):
 
     def test_handle_signal_in_serv_op(self):
         # run pserver on CPU in sync mode
-        p1 = self._start_pserver(False, True)
+        p1 = self._start_pserver(False, True, run_pserver)
+        self._wait_ps_ready(p1.pid)
+
+        # raise SIGTERM to pserver
+        os.kill(p1.pid, signal.SIGINT)
+        p1.join()
+
+        # run pserver on CPU in async mode
+        p2 = self._start_pserver(False, False, run_pserver)
+        self._wait_ps_ready(p2.pid)
+
+        # raise SIGTERM to pserver
+        os.kill(p2.pid, signal.SIGTERM)
+        p2.join()
+
+    def test_list_and_serv_run_empty_optimize_block(self):
+        # run pserver on CPU in sync mode
+        p1 = self._start_pserver(False, True, run_pserver_with_empty_block)
         self._wait_ps_ready(p1.pid)
 
         # raise SIGTERM to pserver
@@ -100,7 +157,7 @@ class TestListenAndServOp(OpTest):
         p1.join()
 
         # run pserver on CPU in async mode
-        p2 = self._start_pserver(False, False)
+        p2 = self._start_pserver(False, False, run_pserver_with_empty_block)
         self._wait_ps_ready(p2.pid)
 
         # raise SIGTERM to pserver
diff --git a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
index 26ce702411..b109e4ea62 100644
--- a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
@@ -22,15 +22,28 @@ from op_test import OpTest
 class TestMergeIdsOp(OpTest):
     def setUp(self):
         self.op_type = "merge_ids"
-        ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
-        x0 = np.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]]).astype('float32')
-        x1 = np.array([]).astype('float32')
-        x2 = np.array([[0.4, 0.5], [0.4, 0.5], [0.5, 0.6],
-                       [0.5, 0.6]]).astype('float32')
-        out = np.array([[0.1, 0.2], [0.4, 0.5], [0.4, 0.5], [0.2, 0.3],
-                        [0.5, 0.6], [0.5, 0.6], [0.3, 0.4]]).astype('float32')
-        self.inputs = {'Ids': ids, "X": [('x0', x0), ('x1', x1), ('x2', x2)]}
-        self.outputs = {'Out': out}
+        ids1 = np.array([[0], [2], [5], [6]]).astype('int64')
+        ids2 = np.array([[0], [2], [2], [3]]).astype('int64')
+
+        rows1 = np.array([[0], [2]]).astype('int64')
+        rows2 = np.array([[3], [5]]).astype('int64')
+        rows3 = np.array([[6]]).astype('int64')
+
+        x0 = np.array([[0.1, 0.2], [0.2, 0.3]]).astype('float32')
+        x1 = np.array([[0.3, 0.4], [0.4, 0.5]]).astype('float32')
+        x2 = np.array([[0.5, 0.6]]).astype('float32')
+
+        out1 = np.array(
+            [[0.1, 0.2], [0.2, 0.3], [0.4, 0.5], [0.5, 0.6]]).astype('float32')
+        out2 = np.array(
+            [[0.1, 0.2], [0.2, 0.3], [0.2, 0.3], [0.3, 0.4]]).astype('float32')
+
+        self.inputs = {
+            'Ids': [('ids1', ids1), ('ids2', ids2)],
+            "Rows": [('rows1', rows1), ('rows2', rows2), ('rows3', rows3)],
+            "X": [('x0', x0), ('x1', x1), ('x2', x2)]
+        }
+        self.outputs = {'Out': [('out1', out1), ('out2', out2)]}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/fluid/tests/unittests/test_metrics.py b/python/paddle/fluid/tests/unittests/test_metrics.py
new file mode 100644
index 0000000000..ec27884cae
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_metrics.py
@@ -0,0 +1,49 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.fluid as fluid
+from paddle.fluid.framework import Program, program_guard
+
+
+class TestMetricsDetectionMap(unittest.TestCase):
+    def test_detection_map(self):
+        program = fluid.Program()
+        with program_guard(program):
+            detect_res = fluid.layers.data(
+                name='detect_res',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+            label = fluid.layers.data(
+                name='label',
+                shape=[10, 1],
+                append_batch_size=False,
+                dtype='float32')
+            box = fluid.layers.data(
+                name='bbox',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            map_eval = fluid.metrics.DetectionMAP(
+                detect_res, label, box, class_num=21)
+            cur_map, accm_map = map_eval.get_map_var()
+            self.assertIsNotNone(cur_map)
+            self.assertIsNotNone(accm_map)
+        print(str(program))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
index a3d89610b4..cf4346cf2e 100644
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -90,6 +90,45 @@ class TestMomentumOp2(OpTest):
         self.check_output()
 
 
+class TestLarsMomentumOp(OpTest):
+    def setUp(self):
+        self.op_type = "lars_momentum"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        velocity = np.zeros((123, 321)).astype("float32")
+        learning_rate = np.array([0.001]).astype("float32")
+        mu = 0.0001
+        lars_coeff = 0.001
+        lars_weight_decay = 0.0005
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Velocity': velocity,
+            'LearningRate': learning_rate
+        }
+
+        self.attrs = {
+            'mu': mu,
+            'lars_coeff': lars_coeff,
+            'lars_weight_decay': lars_weight_decay
+        }
+
+        pnorm = np.sqrt(np.square(param).sum())
+        gnorm = np.sqrt(np.square(grad).sum())
+        local_lr = learning_rate * lars_coeff * pnorm / (
+            gnorm + lars_weight_decay * param)
+        velocity_out = mu * velocity + local_lr * (grad + lars_weight_decay *
+                                                   param)
+        param_out = param - velocity_out
+
+        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestSparseMomentumOp(unittest.TestCase):
     def setUp(self):
         self.use_nesterov = False
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
index cc2d692e18..e7a56bb638 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -232,6 +232,46 @@ class TestResnet(TestParallelExecutorBase):
         for loss in zip(all_reduce_last_loss, reduce_last_loss):
             self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
 
+        if not use_cuda:
+            return
+
+        all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=False,
+            optimizer=optimizer,
+            enable_sequential_execution=True)
+
+        reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=True,
+            optimizer=optimizer,
+            enable_sequential_execution=True)
+
+        for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
+        for loss in zip(reduce_first_loss, reduce_first_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(reduce_last_loss, reduce_last_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
+        for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
     def _check_resnet_convergence(self,
                                   model,
                                   use_cuda=True,
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index a55b2002ed..3827743908 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -173,6 +173,8 @@ class TestTransformer(TestParallelExecutorBase):
     def test_main(self):
         if core.is_compiled_with_cuda():
             self.check_network_convergence(transformer, use_cuda=True)
+            self.check_network_convergence(
+                transformer, use_cuda=True, enable_sequential_execution=True)
         self.check_network_convergence(transformer, use_cuda=False, iter=5)
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index 26969bd523..634df65bb5 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -26,7 +26,8 @@ def max_pool2D_forward_naive(x,
                              strides,
                              paddings,
                              global_pool=0,
-                             ceil_mode=False):
+                             ceil_mode=False,
+                             exclusive=True):
     N, C, H, W = x.shape
     if global_pool == 1:
         ksize = [H, W]
@@ -54,7 +55,8 @@ def avg_pool2D_forward_naive(x,
                              strides,
                              paddings,
                              global_pool=0,
-                             ceil_mode=False):
+                             ceil_mode=False,
+                             exclusive=True):
     N, C, H, W = x.shape
     if global_pool == 1:
         ksize = [H, W]
@@ -73,8 +75,9 @@ def avg_pool2D_forward_naive(x,
             c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
             x_masked = x[:, :, r_start:r_end, c_start:c_end]
 
-            out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / (
-                (r_end - r_start) * (c_end - c_start))
+            field_size = ((r_end - r_start) * (c_end - c_start)) if exclusive \
+                            else (ksize[0] * ksize[1])
+            out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size
     return out
 
 
@@ -89,12 +92,13 @@ class TestPool2d_Op(OpTest):
         self.init_kernel_type()
         self.init_pool_type()
         self.init_ceil_mode()
+        self.init_exclusive()
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
         input = np.random.random(self.shape).astype(self.dtype)
-        output = self.pool2D_forward_naive(input, self.ksize, self.strides,
-                                           self.paddings, self.global_pool,
-                                           self.ceil_mode).astype(self.dtype)
+        output = self.pool2D_forward_naive(
+            input, self.ksize, self.strides, self.paddings, self.global_pool,
+            self.ceil_mode, self.exclusive).astype(self.dtype)
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
         self.attrs = {
@@ -106,7 +110,9 @@ class TestPool2d_Op(OpTest):
             'use_cudnn': self.use_cudnn,
             'use_mkldnn': self.use_mkldnn,
             'ceil_mode': self.ceil_mode,
-            'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
+            'data_format':
+            'AnyLayout',  # TODO(dzhwinter) : should be fix latter
+            'exclusive': self.exclusive
         }
 
         self.outputs = {'Out': output}
@@ -150,6 +156,9 @@ class TestPool2d_Op(OpTest):
     def init_ceil_mode(self):
         self.ceil_mode = False
 
+    def init_exclusive(self):
+        self.exclusive = True
+
 
 class TestCase1(TestPool2d_Op):
     def init_test_case(self):
@@ -322,5 +331,15 @@ class TestCeilModeCase4(TestCase2):
         self.ceil_mode = True
 
 
+class TestAvgInclude(TestCase2):
+    def init_exclusive(self):
+        self.exclusive = False
+
+
+class TestCUDNNAvgInclude(TestCUDNNCase3):
+    def init_exclusive(self):
+        self.exclusive = False
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index 77045c1307..f05f8ccb39 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -26,7 +26,8 @@ def max_pool3D_forward_naive(x,
                              strides,
                              paddings,
                              global_pool=0,
-                             ceil_mode=False):
+                             ceil_mode=False,
+                             exclusive=True):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
@@ -60,7 +61,8 @@ def avg_pool3D_forward_naive(x,
                              strides,
                              paddings,
                              global_pool=0,
-                             ceil_mode=False):
+                             ceil_mode=False,
+                             exclusive=True):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
@@ -85,8 +87,10 @@ def avg_pool3D_forward_naive(x,
                 w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
 
-                out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / (
-                    (d_end - d_start) * (h_end - h_start) * (w_end - w_start))
+                field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \
+                             if exclusive else ksize[0] * ksize[1] * ksize[2]
+                out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3,
+                                                            4)) / field_size
     return out
 
 
@@ -100,13 +104,14 @@ class TestPool3d_Op(OpTest):
         self.init_kernel_type()
         self.init_pool_type()
         self.init_ceil_mode()
+        self.init_exclusive()
 
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
         input = np.random.random(self.shape).astype(self.dtype)
-        output = self.pool3D_forward_naive(input, self.ksize, self.strides,
-                                           self.paddings, self.global_pool,
-                                           self.ceil_mode).astype(self.dtype)
+        output = self.pool3D_forward_naive(
+            input, self.ksize, self.strides, self.paddings, self.global_pool,
+            self.ceil_mode, self.exclusive).astype(self.dtype)
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
         self.attrs = {
@@ -117,7 +122,9 @@ class TestPool3d_Op(OpTest):
             'global_pooling': self.global_pool,
             'use_cudnn': self.use_cudnn,
             'ceil_mode': self.ceil_mode,
-            'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
+            'data_format':
+            'AnyLayout',  # TODO(dzhwinter) : should be fix latter
+            'exclusive': self.exclusive
         }
 
         self.outputs = {'Out': output}
@@ -161,6 +168,9 @@ class TestPool3d_Op(OpTest):
     def init_ceil_mode(self):
         self.ceil_mode = False
 
+    def init_exclusive(self):
+        self.exclusive = True
+
 
 class TestCase1(TestPool3d_Op):
     def init_test_case(self):
@@ -333,5 +343,15 @@ class TestCeilModeCase4(TestCase2):
         self.ceil_mode = True
 
 
+class TestAvgInclude(TestCase2):
+    def init_exclusive(self):
+        self.exclusive = False
+
+
+class TestCUDNNAvgInclude(TestCUDNNCase3):
+    def init_exclusive(self):
+        self.exclusive = False
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py b/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
new file mode 100644
index 0000000000..b913127ad6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+from threading import Thread
+
+
+def user_reader(inputs):
+    def _reader():
+        for d in inputs:
+            yield d
+
+    return _reader
+
+
+def batch_feeder(batch_reader, pin_memory=False, img_dtype="float32"):
+    def _feeder():
+        for batch_data in batch_reader():
+            sample_batch = []
+            label_batch = []
+            for sample, label in batch_data:
+                sample_batch.append(sample)
+                label_batch.append([label])
+            tensor = core.LoDTensor()
+            label = core.LoDTensor()
+            place = core.CUDAPinnedPlace() if pin_memory else core.CPUPlace()
+            tensor.set(np.array(sample_batch, dtype=img_dtype), place)
+            label.set(np.array(label_batch, dtype="int64"), place)
+            yield [tensor, label]
+
+    return _feeder
+
+
+class TestPyReader(unittest.TestCase):
+    def setUp(self):
+        self.capacity = 10
+        self.shapes = [(-1, 3, 2, 1), (-1, 1)]
+        self.lod_levels = [0, 0]
+        self.dtypes = ['float32', 'int64']
+
+    def test_pin_memory_pyreader(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+            ) else fluid.CPUPlace()
+            executor = fluid.Executor(place)
+
+            data_file = fluid.layers.py_reader(
+                capacity=self.capacity,
+                dtypes=self.dtypes,
+                lod_levels=self.lod_levels,
+                shapes=self.shapes)
+            # feed_queue = data_file.queue
+            read_out_data = fluid.layers.read_file(data_file)
+
+            self.inputs = []
+            for _ in range(10):
+                sample = np.random.uniform(
+                    low=0, high=1, size=[3, 2, 1]).astype("float32")
+                label = np.random.uniform(
+                    low=0, high=10, size=[1]).astype("int64")
+                self.inputs.append((sample, label))
+
+            self.input_tensors = []
+            for d, l in batch_feeder(
+                    paddle.batch(
+                        user_reader(self.inputs), batch_size=2),
+                    pin_memory=True
+                    if fluid.core.is_compiled_with_cuda() else False)():
+                ta = fluid.LoDTensorArray()
+                ta.append(d)
+                ta.append(l)
+                self.input_tensors.append(ta)
+
+            self.batched_inputs = []
+            for batch in paddle.batch(user_reader(self.inputs), batch_size=2)():
+                feed_d = []
+                feed_l = []
+                for d, l in batch:
+                    feed_d.append(d)
+                    feed_l.append([l])
+                self.batched_inputs.append([feed_d, feed_l])
+
+            data_file.decorate_tensor_provider(
+                batch_feeder(
+                    paddle.batch(
+                        user_reader(self.inputs), batch_size=2),
+                    pin_memory=True
+                    if fluid.core.is_compiled_with_cuda() else False))
+
+            executor.run(fluid.default_startup_program())
+            self.outputs = []
+
+            data_file.start()
+            for _ in self.input_tensors:
+                self.outputs.append(
+                    executor.run(fetch_list=list(read_out_data)))
+            data_file.reset()
+            self.validate()
+
+    def validate(self):
+        self.assertEqual(len(self.batched_inputs), len(self.outputs))
+        for in_data_list, out_data_list in zip(self.batched_inputs,
+                                               self.outputs):
+            self.assertEqual(len(in_data_list), len(out_data_list))
+            in_data_list_np = [
+                np.array(in_lod_tensor) for in_lod_tensor in in_data_list
+            ]
+            for in_data, out_data in zip(in_data_list_np, out_data_list):
+                self.assertTrue((in_data == out_data).all())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
index f63dbcd3d7..1a2c9bb5f4 100644
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -50,8 +50,10 @@ def rpn_target_assign(anchor_by_gt_overlap,
             fg_inds, size=(len(fg_inds) - num_fg), replace=False)
     else:
         disable_inds = fg_inds[num_fg:]
+
     labels[disable_inds] = -1
     fg_inds = np.where(labels == 1)[0]
+    bbox_inside_weight = np.zeros((len(fg_inds), 4), dtype=np.float32)
 
     num_bg = rpn_batch_size_per_im - np.sum(labels == 1)
     bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
@@ -59,18 +61,27 @@ def rpn_target_assign(anchor_by_gt_overlap,
         enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
     else:
         enable_inds = bg_inds[:num_bg]
+
+    fg_fake_inds = np.array([], np.int32)
+    fg_value = np.array([fg_inds[0]], np.int32)
+    fake_num = 0
+    for bg_id in enable_inds:
+        if bg_id in fg_inds:
+            fake_num += 1
+            fg_fake_inds = np.hstack([fg_fake_inds, fg_value])
     labels[enable_inds] = 0
+
+    bbox_inside_weight[fake_num:, :] = 1
     fg_inds = np.where(labels == 1)[0]
     bg_inds = np.where(labels == 0)[0]
-
-    loc_index = fg_inds
-    score_index = np.hstack((fg_inds, bg_inds))
+    loc_index = np.hstack([fg_fake_inds, fg_inds])
+    score_index = np.hstack([fg_inds, bg_inds])
     labels = labels[score_index]
     assert not np.any(labels == -1), "Wrong labels with -1"
 
-    gt_inds = anchor_to_gt_argmax[fg_inds]
+    gt_inds = anchor_to_gt_argmax[loc_index]
 
-    return loc_index, score_index, labels, gt_inds
+    return loc_index, score_index, labels, gt_inds, bbox_inside_weight
 
 
 def get_anchor(n, c, h, w):
@@ -123,9 +134,12 @@ def rpn_target_assign_in_python(all_anchors,
         gt_boxes_slice = gt_boxes_slice[not_crowd_inds]
         iou = _bbox_overlaps(inside_anchors, gt_boxes_slice)
 
-        loc_inds, score_inds, labels, gt_inds = rpn_target_assign(
-            iou, rpn_batch_size_per_im, rpn_positive_overlap,
-            rpn_negative_overlap, rpn_fg_fraction, use_random)
+        loc_inds, score_inds, labels, gt_inds, bbox_inside_weight = \
+                         rpn_target_assign(iou, rpn_batch_size_per_im,
+                                           rpn_positive_overlap,
+                                           rpn_negative_overlap,
+                                           rpn_fg_fraction,
+                                           use_random)
         # unmap to all anchor 
         loc_inds = inds_inside[loc_inds]
         score_inds = inds_inside[score_inds]
@@ -139,6 +153,7 @@ def rpn_target_assign_in_python(all_anchors,
             score_indexes = score_inds
             tgt_labels = labels
             tgt_bboxes = box_deltas
+            bbox_inside_weights = bbox_inside_weight
         else:
             loc_indexes = np.concatenate(
                 [loc_indexes, loc_inds + i * anchor_num])
@@ -146,8 +161,10 @@ def rpn_target_assign_in_python(all_anchors,
                 [score_indexes, score_inds + i * anchor_num])
             tgt_labels = np.concatenate([tgt_labels, labels])
             tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
+            bbox_inside_weights = np.vstack([bbox_inside_weights, \
+                                             bbox_inside_weight])
 
-    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels
+    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels, bbox_inside_weights
 
 
 class TestRpnTargetAssignOp(OpTest):
@@ -182,10 +199,12 @@ class TestRpnTargetAssignOp(OpTest):
         rpn_fg_fraction = 0.5
         use_random = False
 
-        loc_index, score_index, tgt_bbox, labels = rpn_target_assign_in_python(
-            all_anchors, gt_boxes, is_crowd, im_info, lod, rpn_straddle_thresh,
-            rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap,
-            rpn_fg_fraction, use_random)
+        loc_index, score_index, tgt_bbox, labels, bbox_inside_weights = \
+            rpn_target_assign_in_python(all_anchors, gt_boxes, is_crowd,
+                                   im_info, lod, rpn_straddle_thresh,
+                                   rpn_batch_size_per_im, rpn_positive_overlap,
+                                   rpn_negative_overlap,
+                                   rpn_fg_fraction, use_random)
         labels = labels[:, np.newaxis]
 
         self.op_type = "rpn_target_assign"
@@ -207,7 +226,8 @@ class TestRpnTargetAssignOp(OpTest):
             'LocationIndex': loc_index.astype('int32'),
             'ScoreIndex': score_index.astype('int32'),
             'TargetBBox': tgt_bbox.astype('float32'),
-            'TargetLabel': labels.astype('int32')
+            'TargetLabel': labels.astype('int32'),
+            'BBoxInsideWeight': bbox_inside_weights.astype('float32')
         }
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py
index 641eb03a5f..a80ad5b079 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
@@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
             out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
 
 
+class TestSeqMaxPool2DInference(TestSeqMaxPool2D):
+    def compute(self, x, offset, out):
+        self.attrs = {'pooltype': "MAX", 'is_test': True}
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 11))
+            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
+
+    def test_check_grad(self):
+        """Grad computation does not apply to Sequence MAX 
+            Pool executed when is_test is true """
+        return
+
+
 class TestSeqLastPool2D(TestSeqAvgPool2D):
     def compute(self, x, offset, out):
         self.attrs = {'pooltype': "LAST"}
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reverse.py b/python/paddle/fluid/tests/unittests/test_sequence_reverse.py
new file mode 100644
index 0000000000..eebd25e097
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_sequence_reverse.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from op_test import OpTest
+import numpy as np
+
+
+class TestSequenceReverseBase(OpTest):
+    def initParameters(self):
+        pass
+
+    def setUp(self):
+        self.size = (10, 3, 4)
+        self.lod = [2, 3, 5]
+        self.dtype = 'float32'
+        self.initParameters()
+        self.op_type = 'sequence_reverse'
+        self.x = np.random.random(self.size).astype(self.dtype)
+        self.y = self.get_output()
+
+        self.inputs = {'X': (self.x, [self.lod, ]), }
+        self.outputs = {'Y': (self.y, [self.lod, ]), }
+
+    def get_output(self):
+        tmp_x = np.reshape(self.x, newshape=[self.x.shape[0], -1])
+        tmp_y = np.ndarray(tmp_x.shape).astype(self.dtype)
+        prev_idx = 0
+        for cur_len in self.lod:
+            idx_range = range(prev_idx, prev_idx + cur_len)
+            tmp_y[idx_range, :] = np.flip(tmp_x[idx_range, :], 0)
+            prev_idx += cur_len
+
+        return np.reshape(tmp_y, newshape=self.x.shape).astype(self.dtype)
+
+    def test_output(self):
+        self.check_output(0)
+
+    def test_grad(self):
+        self.check_grad(['X'], 'Y')
+
+
+class TestSequenceReserve1(TestSequenceReverseBase):
+    def initParameters(self):
+        self.size = (12, 10)
+        self.lod = [4, 5, 3]
+
+
+class TestSequenceReverse2(TestSequenceReverseBase):
+    def initParameters(self):
+        self.size = (12, 10)
+        self.lod = [12]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_slice_var.py b/python/paddle/fluid/tests/unittests/test_slice_var.py
index fab63b7d56..b16c744603 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_var.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_var.py
@@ -30,7 +30,6 @@ class TestSliceVar(unittest.TestCase):
             var = program.global_block().create_var(
                 name=str(random.randint(10000, 99999)),
                 persistable=True,
-                # dtype=core.VarDesc.VarType.LOD_TENSOR,
                 shape=shape)
             var_list.append(var)
         blocks = slice_variable(var_list, 10, min_size)
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
index a18941dd31..37ee880970 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
@@ -26,7 +26,11 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
     Test softmax with cross entropy operator with discreate one-hot labels.
     """
 
+    def initParams(self):
+        self.numeric_stable_mode = False
+
     def setUp(self):
+        self.initParams()
         self.op_type = "softmax_with_cross_entropy"
         batch_size = 41
         class_num = 37
@@ -46,6 +50,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
             "Softmax": softmax.astype("float64"),
             "Loss": cross_entropy.astype("float64")
         }
+        self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
 
     def test_check_output(self):
         self.check_output()
@@ -54,6 +59,11 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         self.check_grad(["Logits"], "Loss")
 
 
+class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
+    def initParams(self):
+        self.numeric_stable_mode = True
+
+
 class TestSoftmaxWithCrossEntropyOp2(OpTest):
     """
     Test softmax with cross entropy operator with soft labels.
@@ -93,7 +103,11 @@ class TestSoftmaxWithCrossEntropyOp3(OpTest):
     Test softmax with cross entropy operator with ignore_index.
     """
 
+    def initParams(self):
+        self.numeric_stable_mode = False
+
     def setUp(self):
+        self.initParams()
         self.op_type = "softmax_with_cross_entropy"
         batch_size = 41
         class_num = 37
@@ -114,7 +128,10 @@ class TestSoftmaxWithCrossEntropyOp3(OpTest):
             "Softmax": softmax.astype("float64"),
             "Loss": cross_entropy.astype("float64")
         }
-        self.attrs = {"ignore_index": ignore_index}
+        self.attrs = {
+            "ignore_index": ignore_index,
+            "numeric_stable_mode": self.numeric_stable_mode
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -123,5 +140,10 @@ class TestSoftmaxWithCrossEntropyOp3(OpTest):
         self.check_grad(["Logits"], "Loss")
 
 
+class TestSoftmaxWithCrossEntropyOp3NoCudnn(TestSoftmaxWithCrossEntropyOp3):
+    def initParams(self):
+        self.numeric_stable_mode = True
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
index 4c3d025898..d674dad229 100644
--- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
@@ -25,18 +25,21 @@ from paddle.fluid.op import Operator
 class TestSplitIdsOp(OpTest):
     def setUp(self):
         self.op_type = "split_ids"
-        ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
+        ids1 = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
+        ids2 = np.array([[6], [2], [3], [3], [5], [2], [6]]).astype('int64')
+        ids3 = np.array([[2], [2], [2], [3], [5], [5], [6]]).astype('int64')
+
         out0 = np.array([[0], [3], [6]]).astype('int64')
         out1 = np.array([[]]).astype('int64')
-        out2 = np.array([[2], [2], [5], [5]]).astype('int64')
-        self.inputs = {'Ids': ids}
+        out2 = np.array([[2], [5]]).astype('int64')
+        self.inputs = {'Ids': [('ids1', ids1), ('ids2', ids2), ('ids3', ids3)]}
         self.outputs = {'Out': [('out0', out0), ('out1', out1), ('out2', out2)]}
 
     def test_check_output(self):
         self.check_output()
 
 
-class TestSpliteIds(unittest.TestCase):
+class TestSplitSelectedRows(unittest.TestCase):
     def get_places(self):
         places = [core.CPUPlace()]
         return places
diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
index 41a5ee59ea..50204b8a77 100644
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
@@ -99,7 +99,6 @@ class TestSpliteSelectedRows(unittest.TestCase):
         out0_grad.set_height(height)
         out0_grad_tensor = out0_grad.get_tensor()
         np_array = np.ones((len(rows0), row_numel)).astype("float32")
-        np_array[0, 0] = 2.0
         out0_grad_tensor.set(np_array, place)
 
         out1_grad = scope.var("out1@GRAD").get_selected_rows()
@@ -108,7 +107,6 @@ class TestSpliteSelectedRows(unittest.TestCase):
         out1_grad.set_height(height)
         out1_grad_tensor = out1_grad.get_tensor()
         np_array = np.ones((len(rows1), row_numel)).astype("float32")
-        np_array[0, 1] = 4.0
         out1_grad_tensor.set(np_array, place)
 
         x_grad = scope.var("X@GRAD").get_selected_rows()
@@ -121,11 +119,13 @@ class TestSpliteSelectedRows(unittest.TestCase):
 
         grad_op.run(scope, place)
 
-        self.assertEqual(x_grad.rows(), rows0 + rows1)
+        merged_rows = set(rows0 + rows1)
+        self.assertEqual(set(x_grad.rows()), set(rows0 + rows1))
         self.assertEqual(x_grad.height(), height)
 
+        print(np.array(x_grad.get_tensor()))
         self.assertAlmostEqual(2.0, np.array(x_grad.get_tensor())[0, 0])
-        self.assertAlmostEqual(4.0, np.array(x_grad.get_tensor())[2, 1])
+        self.assertAlmostEqual(1.0, np.array(x_grad.get_tensor())[2, 1])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
index 74797bb656..e20418ff1c 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -45,16 +45,30 @@ class TestSumOp(OpTest):
 
 
 class TestSelectedRowsSumOp(OpTest):
-    def check_with_place(self, place):
-        scope = core.Scope()
-        self.check_input_and_optput(scope, place, True, True, True)
-        self.check_input_and_optput(scope, place, False, True, True)
-        self.check_input_and_optput(scope, place, False, False, True)
-        self.check_input_and_optput(scope, place, False, False, False)
+    def check_with_place(self, place, inplace):
+        self.height = 10
+        self.row_numel = 12
+        self.rows = [0, 1, 2, 3, 4, 5, 6]
+
+        self.check_input_and_optput(core.Scope(), place, inplace, True, True,
+                                    True)
+        self.check_input_and_optput(core.Scope(), place, inplace, False, True,
+                                    True)
+        self.check_input_and_optput(core.Scope(), place, inplace, False, False,
+                                    True)
+        self.check_input_and_optput(core.Scope(), place, inplace, False, False,
+                                    False)
+
+    def _get_array(self, row_num, row_numel):
+        array = np.ones((row_num, row_numel)).astype("float32")
+        for i in range(row_num):
+            array[i] *= i
+        return array
 
     def check_input_and_optput(self,
                                scope,
                                place,
+                               inplace,
                                w1_has_data=False,
                                w2_has_data=False,
                                w3_has_data=False):
@@ -64,35 +78,43 @@ class TestSelectedRowsSumOp(OpTest):
         self.create_selected_rows(scope, place, "W3", w3_has_data)
 
         # create Out Variable
-        out = scope.var('Out').get_selected_rows()
+        if inplace:
+            out_var_name = "W1"
+        else:
+            out_var_name = "Out"
+        out = scope.var(out_var_name).get_selected_rows()
 
         # create and run sum operator
-        sum_op = Operator("sum", X=["W1", "W2", "W3"], Out='Out')
+        sum_op = Operator("sum", X=["W1", "W2", "W3"], Out=out_var_name)
         sum_op.run(scope, place)
 
         has_data_w_num = 0
-        for w in [w1_has_data, w2_has_data, w3_has_data]:
-            if not w:
+        for has_data in [w1_has_data, w2_has_data, w3_has_data]:
+            if has_data:
                 has_data_w_num += 1
 
-        self.assertEqual(7 * has_data_w_num, len(out.rows()))
+        if has_data_w_num > 0:
+            self.assertEqual(len(out.rows()), 7)
+            self.assertTrue(
+                np.array_equal(
+                    np.array(out.get_tensor()),
+                    self._get_array(len(self.rows), self.row_numel) *
+                    has_data_w_num))
+        else:
+            self.assertEqual(len(out.rows()), 0)
 
-    def create_selected_rows(self, scope, place, var_name, isEmpty):
+    def create_selected_rows(self, scope, place, var_name, has_data):
         # create and initialize W Variable
-        if not isEmpty:
-            rows = [0, 1, 2, 3, 4, 5, 6]
-            row_numel = 12
+        if has_data:
+            rows = self.rows
         else:
             rows = []
-            row_numel = 12
 
         var = scope.var(var_name)
         w_selected_rows = var.get_selected_rows()
-        w_selected_rows.set_height(len(rows))
+        w_selected_rows.set_height(self.height)
         w_selected_rows.set_rows(rows)
-        w_array = np.ones((len(rows), row_numel)).astype("float32")
-        for i in range(len(rows)):
-            w_array[i] *= i
+        w_array = self._get_array(len(rows), self.row_numel)
         w_tensor = w_selected_rows.get_tensor()
         w_tensor.set(w_array, place)
 
@@ -100,9 +122,11 @@ class TestSelectedRowsSumOp(OpTest):
 
     def test_w_is_selected_rows(self):
         places = [core.CPUPlace()]
-        # currently only support CPU
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
         for place in places:
-            self.check_with_place(place)
+            for inplace in [True, False]:
+                self.check_with_place(place, inplace)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_op.py b/python/paddle/fluid/tests/unittests/test_top_k_op.py
index e54e170f7f..69b29db83a 100644
--- a/python/paddle/fluid/tests/unittests/test_top_k_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py
@@ -21,22 +21,27 @@ from op_test import OpTest
 
 class TestTopkOp(OpTest):
     def setUp(self):
+        self.set_args()
         self.op_type = "top_k"
-        k = 1
-        input = np.random.random((32, 84)).astype("float32")
-        output = np.ndarray((32, k))
-        indices = np.ndarray((32, k)).astype("int64")
+        k = self.top_k
+        input = np.random.random((self.row, k)).astype("float32")
+        output = np.ndarray((self.row, k))
+        indices = np.ndarray((self.row, k)).astype("int64")
 
         self.inputs = {'X': input}
         self.attrs = {'k': k}
 
-        for rowid in range(32):
+        for rowid in range(self.row):
             row = input[rowid]
-            output[rowid] = np.sort(row)[-k:]
-            indices[rowid] = row.argsort()[-k:]
+            output[rowid] = np.sort(row)[::-1][:k]
+            indices[rowid] = row.argsort()[::-1][:k]
 
         self.outputs = {'Out': output, 'Indices': indices}
 
+    def set_args(self):
+        self.row = 32
+        self.top_k = 1
+
     def test_check_output(self):
         self.check_output()
 
@@ -50,14 +55,39 @@ class TestTopkOp3d(OpTest):
         output = np.ndarray((64, k))
         indices = np.ndarray((64, k)).astype("int64")
 
-        # FIXME: should use 'X': input for a 3d input
-        self.inputs = {'X': input_flat_2d}
+        self.inputs = {'X': input}
         self.attrs = {'k': k}
 
         for rowid in range(64):
             row = input_flat_2d[rowid]
-            output[rowid] = np.sort(row)[-k:]
-            indices[rowid] = row.argsort()[-k:]
+            output[rowid] = np.sort(row)[::-1][:k]
+            indices[rowid] = row.argsort()[::-1][:k]
+
+        self.outputs = {
+            'Out': output.reshape((32, 2, k)),
+            'Indices': indices.reshape((32, 2, k))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestTopkOp2(OpTest):
+    def setUp(self):
+        self.op_type = "top_k"
+        k = 1
+        m = 2056
+        input = np.random.random((m, 84)).astype("float32")
+        output = np.ndarray((m, k))
+        indices = np.ndarray((m, k)).astype("int64")
+
+        self.inputs = {'X': input}
+        self.attrs = {'k': k}
+
+        for rowid in range(m):
+            row = input[rowid]
+            output[rowid] = -np.sort(-row)[:k]
+            indices[rowid] = (-row).argsort()[:k]
 
         self.outputs = {'Out': output, 'Indices': indices}
 
@@ -65,5 +95,17 @@ class TestTopkOp3d(OpTest):
         self.check_output()
 
 
+class TestTopkOp3(TestTopkOp):
+    def set_args(self):
+        self.row = 2056
+        self.top_k = 3
+
+
+class TestTopkOp4(TestTopkOp):
+    def set_args(self):
+        self.row = 40000
+        self.top_k = 1
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 2192139f8d..9066fc9d1b 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -35,6 +35,7 @@ import sys
 import numpy as np
 import collections
 import six
+import logging
 
 from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
@@ -49,6 +50,7 @@ LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
 OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
 RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
 )
+OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize
 RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
 DIST_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Dist
 LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched
@@ -474,6 +476,26 @@ class DistributeTranspiler(object):
         delete_ops(self.origin_program.global_block(), self.optimize_ops)
         delete_ops(self.origin_program.global_block(), lr_ops)
 
+        # delete table init op
+        if self.has_distributed_lookup_table:
+            table_var = self.startup_program.global_block().vars[
+                self.table_name]
+            table_param_init_op = []
+            for op in self.startup_program.global_block().ops:
+                if self.table_name in op.output_arg_names:
+                    table_param_init_op.append(op)
+            init_op_num = len(table_param_init_op)
+            if init_op_num != 1:
+                raise ValueError("table init op num should be 1, now is " + str(
+                    init_op_num))
+            table_init_op = table_param_init_op[0]
+            self.startup_program.global_block().append_op(
+                type="fake_init",
+                inputs={},
+                outputs={"Out": table_var},
+                attrs={"shape": table_init_op.attr('shape')})
+            delete_ops(self.startup_program.global_block(), table_param_init_op)
+
         self.origin_program.__str__()
 
         if wait_port:
@@ -712,7 +734,7 @@ in a single call.")
                 for _, op in enumerate(self.optimize_ops):
                     # optimizer is connected to itself
                     if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \
-                        op not in global_ops:
+                            op not in global_ops:
                         log("append opt op: ", op.type, op.input_arg_names,
                             merged_var)
                         __append_optimize_op__(op, per_opt_block,
@@ -746,6 +768,15 @@ in a single call.")
             prefetch_var_name_to_block_id.extend(
                 lookup_table_var_name_to_block_id)
 
+        if len(optimize_blocks) == 0:
+            logging.warn("pserver [" + str(endpoint) +
+                         "] has no optimize block!!")
+            pre_block_idx = pserver_program.num_blocks - 1
+            empty_block = pserver_program._create_block(pre_block_idx)
+            optimize_blocks.append(empty_block)
+
+        # In some case, some parameter server will have no parameter to optimize
+        # So we give an empty optimize block to parameter server.
         attrs = {
             "optimize_blocks": optimize_blocks,
             "endpoint": endpoint,
@@ -889,11 +920,11 @@ to transpile() call.")
             block_idx = int(block_name.split(block_suffix)[1])
             orig_var = self.origin_program.global_block().vars[orig_var_name]
 
-            skip_numel = 0
+            skip_dim0 = 0
             slice_vars = self.param_var_mapping[orig_var_name]
             for slice_var in slice_vars[:block_idx]:
-                skip_numel += reduce(lambda x, y: x * y, slice_var.shape)
-            slice_vars_and_attrs.append([orig_var, skip_numel, param])
+                skip_dim0 += slice_var.shape[0]
+            slice_vars_and_attrs.append([orig_var, skip_dim0, param])
 
         return slice_vars_and_attrs
 
@@ -1033,90 +1064,87 @@ to transpile() call.")
     def _replace_lookup_table_op_with_prefetch(self, program,
                                                pserver_endpoints):
         # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
-        # self.all_prefetch_input_vars =
-        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
-        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
+        self.all_in_ids_vars = []
         self.all_prefetch_input_vars = []
-
-        # self.all_prefetch_input_vars =
-        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
-        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
         self.all_prefetch_output_vars = []
+        self.all_out_emb_vars = []
+        lookup_table_op_index = -1
 
         continue_search_lookup_table_op = True
         while continue_search_lookup_table_op:
             continue_search_lookup_table_op = False
             all_ops = program.global_block().ops
             for op in all_ops:
-                if op.type == LOOKUP_TABLE_TYPE:
+                if op.type == LOOKUP_TABLE_TYPE and self.table_name == op.input(
+                        "W")[0]:
+                    if not op.attr('is_distributed'):
+                        raise RuntimeError(
+                            "lookup_table_op that lookup an distributed embedding table"
+                            "should set is_distributed to true")
                     continue_search_lookup_table_op = True
 
-                    lookup_table_op_index = list(all_ops).index(op)
+                    lookup_table_op_index = lookup_table_op_index if lookup_table_op_index != -1 else list(
+                        all_ops).index(op)
                     ids_name = op.input("Ids")
                     out_name = op.output("Out")
 
                     ids_var = program.global_block().vars[ids_name[0]]
-                    prefetch_input_vars = self._create_splited_vars(
-                        source_var=ids_var,
-                        block=program.global_block(),
-                        tag="_prefetch_in_")
-                    self.all_prefetch_input_vars.append(prefetch_input_vars)
+                    self.all_in_ids_vars.append(ids_var)
 
                     out_var = program.global_block().vars[out_name[0]]
-                    prefetch_output_vars = self._create_splited_vars(
-                        source_var=out_var,
-                        block=program.global_block(),
-                        tag="_prefetch_out_")
-                    self.all_prefetch_output_vars.append(prefetch_output_vars)
-
-                    # insert split_ids_op
-                    program.global_block()._insert_op(
-                        index=lookup_table_op_index,
-                        type="split_ids",
-                        inputs={
-                            'Ids': [
-                                program.global_block().vars[varname]
-                                for varname in ids_name
-                            ]
-                        },
-                        outputs={"Out": prefetch_input_vars})
-
-                    # insert prefetch_op
-                    program.global_block()._insert_op(
-                        index=lookup_table_op_index + 1,
-                        type="prefetch",
-                        inputs={'X': prefetch_input_vars},
-                        outputs={"Out": prefetch_output_vars},
-                        attrs={
-                            "epmap": pserver_endpoints,
-                            # FIXME(qiao) temporarily disable this config because prefetch
-                            # is not act as other rpc op, it's more like a forward op
-                            # RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
-                        })
-
-                    # insert concat_op
-                    program.global_block()._insert_op(
-                        index=lookup_table_op_index + 2,
-                        type="merge_ids",
-                        inputs={
-                            'Ids': [
-                                program.global_block().vars[varname]
-                                for varname in ids_name
-                            ],
-                            'X': prefetch_output_vars
-                        },
-                        outputs={
-                            "Out": [
-                                program.global_block().vars[varname]
-                                for varname in out_name
-                            ]
-                        })
+                    self.all_out_emb_vars.append(out_var)
 
                     # delete lookup_table_op
                     delete_ops(program.global_block(), [op])
                     # break for loop
                     break
 
+        for index in range(len(self.pserver_endpoints)):
+            in_var = program.global_block().create_var(
+                name=str("prefetch_compress_in_tmp_" + str(index)),
+                type=self.all_in_ids_vars[0].type,
+                shape=self.all_in_ids_vars[0].shape,
+                dtype=self.all_in_ids_vars[0].dtype)
+            self.all_prefetch_input_vars.append(in_var)
+
+            out_var = program.global_block().create_var(
+                name=str("prefetch_compress_out_tmp_" + str(index)),
+                type=self.all_out_emb_vars[0].type,
+                shape=self.all_out_emb_vars[0].shape,
+                dtype=self.all_out_emb_vars[0].dtype)
+            self.all_prefetch_output_vars.append(out_var)
+
+        # insert split_ids_op
+        program.global_block()._insert_op(
+            index=lookup_table_op_index,
+            type="split_ids",
+            inputs={'Ids': self.all_in_ids_vars},
+            outputs={"Out": self.all_prefetch_input_vars})
+
+        # insert prefetch_op
+        program.global_block()._insert_op(
+            index=lookup_table_op_index + 1,
+            type="prefetch",
+            inputs={'X': self.all_prefetch_input_vars},
+            outputs={"Out": self.all_prefetch_output_vars},
+            attrs={
+                "epmap": pserver_endpoints,
+                # FIXME(qiao) temporarily disable this config because prefetch
+                # is not act as other rpc op, it's more like a forward op
+                # RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+            })
+
+        # insert concat_op
+        program.global_block()._insert_op(
+            index=lookup_table_op_index + 2,
+            type="merge_ids",
+            inputs={
+                'Ids': self.all_in_ids_vars,
+                'Rows': self.all_prefetch_input_vars,
+                'X': self.all_prefetch_output_vars
+            },
+            outputs={"Out": self.all_out_emb_vars})
+
     def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
         # 2. add split_ids_op and send_op to send gradient to pservers
 
@@ -1133,7 +1161,8 @@ to transpile() call.")
                     inputs={
                         'Ids': [program.global_block().vars[table_grad_name]]
                     },
-                    outputs={"Out": self.trainer_side_table_grad_list})
+                    outputs={"Out": self.trainer_side_table_grad_list},
+                    attrs={RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE})
                 program.global_block()._insert_op(
                     index=op_index + 2,
                     type="send",
@@ -1159,32 +1188,31 @@ to transpile() call.")
         # STEP: create prefetch block
         table_var = pserver_program.global_block().vars[self.table_name]
         prefetch_var_name_to_block_id = []
-        for index in range(len(self.all_prefetch_input_vars)):
-            prefetch_block = pserver_program._create_block(optimize_block.idx)
-            trainer_ids = self.all_prefetch_input_vars[index][pserver_index]
-            pserver_ids = pserver_program.global_block().create_var(
-                name=trainer_ids.name,
-                type=trainer_ids.type,
-                shape=trainer_ids.shape,
-                dtype=trainer_ids.dtype)
-            trainer_out = self.all_prefetch_output_vars[index][pserver_index]
-            pserver_out = pserver_program.global_block().create_var(
-                name=trainer_out.name,
-                type=trainer_out.type,
-                shape=trainer_out.shape,
-                dtype=trainer_out.dtype)
-            prefetch_block.append_op(
-                type="lookup_sparse_table",
-                inputs={'Ids': pserver_ids,
-                        "W": table_var},
-                outputs={"Out": pserver_out},
-                attrs={
-                    "is_sparse": True,  # has no effect on lookup_table op
-                    "is_distributed": True,
-                    "padding_idx": -1
-                })
-            prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
-                prefetch_block.idx))
+        prefetch_block = pserver_program._create_block(optimize_block.idx)
+        trainer_ids = self.all_prefetch_input_vars[pserver_index]
+        pserver_ids = pserver_program.global_block().create_var(
+            name=trainer_ids.name,
+            type=trainer_ids.type,
+            shape=trainer_ids.shape,
+            dtype=trainer_ids.dtype)
+        trainer_out = self.all_prefetch_output_vars[pserver_index]
+        pserver_out = pserver_program.global_block().create_var(
+            name=trainer_out.name,
+            type=trainer_out.type,
+            shape=trainer_out.shape,
+            dtype=trainer_out.dtype)
+        prefetch_block.append_op(
+            type="lookup_sparse_table",
+            inputs={'Ids': pserver_ids,
+                    "W": table_var},
+            outputs={"Out": pserver_out},
+            attrs={
+                "is_sparse": True,  # has no effect on lookup_table op
+                "is_distributed": True,
+                "padding_idx": -1
+            })
+        prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
+            prefetch_block.idx))
         return prefetch_var_name_to_block_id
 
     def _create_table_optimize_block(self, pserver_index, pserver_program,
@@ -1262,7 +1290,6 @@ to transpile() call.")
         }
         outputs = {"ParamOut": [param_var]}
         # only support sgd now
-        import logging
         logging.warn(
             "distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of "
             + table_opt_op.type)
@@ -1363,16 +1390,6 @@ to transpile() call.")
             program.global_block()._sync_with_cpp()
         return var_mapping
 
-    def _create_splited_vars(self, source_var, block, tag):
-        return [
-            block.create_var(
-                name=str(source_var.name + tag + str(index)),
-                type=source_var.type,
-                shape=source_var.shape,
-                dtype=source_var.dtype)
-            for index in range(len(self.pserver_endpoints))
-        ]
-
     def _clone_var(self, block, var, persistable=True):
         return block.create_var(
             name=var.name,
@@ -1430,7 +1447,7 @@ to transpile() call.")
         elif op_type == "adamax":
             if varkey in ["Moment", "InfNorm"]:
                 return param_shape
-        elif op_type == "momentum":
+        elif op_type in ["momentum", "lars_momentum"]:
             if varkey == "Velocity":
                 return param_shape
         elif op_type == "rmsprop":
@@ -1439,8 +1456,15 @@ to transpile() call.")
         elif op_type == "decayed_adagrad":
             if varkey == "Moment":
                 return param_shape
+        elif op_type == "ftrl":
+            if varkey in ["SquaredAccumulator", "LinearAccumulator"]:
+                return param_shape
         elif op_type == "sgd":
             pass
+        else:
+            raise ValueError(
+                "Not supported optimizer for distributed training: %s" %
+                op_type)
         return orig_shape
 
     def _get_varname_parts(self, varname):
@@ -1717,8 +1741,10 @@ to transpile() call.")
         lr_ops = []
         block = self.origin_program.global_block()
         for op in block.ops:
-            if int(op.attr(RPC_OP_ROLE_ATTR_NAME)) == int(
-                    LR_SCHED_OP_ROLE_ATTR_VALUE):
+            role_id = int(op.attr(RPC_OP_ROLE_ATTR_NAME))
+            if role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) or \
+                role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) | \
+                    int(OPT_OP_ROLE_ATTR_VALUE):
                 lr_ops.append(op)
                 log("append lr op: ", op.type)
         return lr_ops
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index 5269bd94ce..9a13cecc64 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -61,6 +61,9 @@ class InferenceTranspiler(object):
             raise TypeError("scope should be as Scope type or None")
         use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
 
+        if use_mkldnn:
+            self._depthwise_conv_mkldnn(program)
+
         self._fuse_batch_norm(program, place, scope)
         if use_mkldnn:
             self._fuse_conv_bias_mkldnn(program)
@@ -70,6 +73,31 @@ class InferenceTranspiler(object):
                 program)  # ResNet residual block merging
             self._fuse_bn_relu_mkldnn(program)
 
+    def _depthwise_conv_mkldnn(self, program):
+        '''
+        Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program.
+        The result is:
+            - before:
+                - any_other_op->depthwise_conv->any_other_op
+            - after:
+                - any_other_op->conv->any_other_op
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops):
+            current_op = self.block.ops[i]
+            if current_op.type == 'depthwise_conv2d':
+                current_op.desc.set_type("conv2d")
+            i = i + 1
+
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
     def _fuse_conv_eltwise_mkldnn(self, program):
         '''
         Transpile the program fusing elementwise_add into conv for MKLDNN
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 861bb5fae5..c9f1be9347 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -171,7 +171,7 @@ class ControlFlowGraph(object):
                 self._live_out[i] |= self._live_in[s]
             self._live_in[i] = self._uses[i] | (
                 self._live_out[i] - self._defs[i])
-            if live_in[i] != self._live_in[i]:
+            if live_in[i] != set(self._live_in[i]):
                 for d in self._presuccessors[i]:
                     worklist.append(d)
 
@@ -321,8 +321,7 @@ class ControlFlowGraph(object):
 
                         if not compare_shape(x_shape, cache_shape, level):
                             continue
-                        # TODO(qijun): actually, we should compare
-                        # dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
+                        # TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
                         if x_dtype != cache_dtype:
                             continue
 
@@ -487,7 +486,6 @@ def memory_optimize(input_program,
             skip_opt_set = grad_set
         else:
             skip_opt_set.update(grad_set)
-
     cfgs = _get_cfgs(input_program)
     for cfg in cfgs:
         cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py
index 5de6f966a0..db6fe2d5ff 100644
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
@@ -12,5 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from plot import Ploter
+from .plot import Ploter
 __all__ = ['dump_config', 'Ploter']
diff --git a/python/setup.py.in b/python/setup.py.in
index b376be0ea3..ee19294ad5 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -27,7 +27,7 @@ def _get_version_detail(idx):
     if re.match('@TAG_VERSION_REGEX@', '@PADDLE_VERSION@'):
         version_details = '@PADDLE_VERSION@'.split('.')
 
-        if len(version_details) == 3:
+        if len(version_details) >= 3:
             return version_details[idx]
 
     return 0