diff --git a/CMakeLists.txt b/CMakeLists.txt index 8dcf9786e3..efa68c9ba2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,6 +214,7 @@ if (NOT WIN32) # there is no official support of warpctc, nccl, cupti in windows include(external/warpctc) # download, build, install warpctc include(cupti) +include(external/gzstream) endif (NOT WIN32) if(WITH_DISTRIBUTE) diff --git a/Dockerfile b/Dockerfile index 9459552890..84e1edbee9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,6 +43,8 @@ RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null +RUN rm -r /root/python_build + RUN apt-get update && \ apt-get install -y --allow-downgrades patchelf \ python3 python3-dev python3-pip \ diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 964d5fd45b..414e92eb27 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -199,8 +199,11 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) endif() else(NOT WIN32) +list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS "-g -G") + list(APPEND CUDA_NVCC_FLAGS "-g -G") + # match the cl's _ITERATOR_DEBUG_LEVEL + list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") elseif(CMAKE_BUILD_TYPE STREQUAL "Release") list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") else() diff --git a/cmake/external/gzstream.cmake b/cmake/external/gzstream.cmake new file mode 100644 index 0000000000..3e36ef7ae2 --- /dev/null +++ b/cmake/external/gzstream.cmake @@ -0,0 +1,48 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE) + return() +ENDIF() + +include (ExternalProject) + +# NOTE: gzstream is needed when linking with ctr reader. + +SET(GZSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/gzstream) +SET(GZSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gzstream) +SET(GZSTREAM_INCLUDE_DIR "${GZSTREAM_INSTALL_DIR}/include/" CACHE PATH "gzstream include directory." 
FORCE) + +ExternalProject_Add( + extern_gzstream + DEPENDS zlib + GIT_REPOSITORY "https://github.com/jacquesqiao/gzstream.git" + GIT_TAG "" + PREFIX ${GZSTREAM_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + BUILD_COMMAND make EXTERN_CPPFLAGS="-I${THIRD_PARTY_PATH}/install/zlib/include" EXTERM_LDFLAGS="-L${THIRD_PARTY_PATH}/install/zlib/lib" -j8 + INSTALL_COMMAND mkdir -p ${GZSTREAM_INSTALL_DIR}/lib/ && mkdir -p ${GZSTREAM_INSTALL_DIR}/include/ + && cp ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/libgzstream.a ${GZSTREAM_INSTALL_DIR}/lib + && cp -r ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/gzstream.h ${GZSTREAM_INSTALL_DIR}/include +) + +ADD_LIBRARY(gzstream STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gzstream PROPERTY IMPORTED_LOCATION + "${GZSTREAM_INSTALL_DIR}/lib/libgzstream.a") + +include_directories(${GZSTREAM_INCLUDE_DIR}) +ADD_DEPENDENCIES(gzstream extern_gzstream zlib) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index 2e335579f3..e66459fa3a 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH}) return() ENDIF() +INCLUDE(GNUInstallDirs) + INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") @@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) +SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") +SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) +SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) +SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) ExternalProject_Add( ${NGRAPH_PROJECT} @@ -63,18 +69,6 @@ ExternalProject_Add( CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib ) -if(UNIX AND NOT APPLE) - include(GNUInstallDirs) - SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) -else() - SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib) -endif() -MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}") - -SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) -SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) -SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) - # Workaround for nGraph expecting mklml to be in mkldnn install directory. 
ExternalProject_Add_Step( ${NGRAPH_PROJECT} diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index c885877a2b..3a10ea945d 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -26,7 +26,7 @@ ExternalProject_Add( extern_pybind ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/pybind/pybind11.git" - GIT_TAG "v2.1.1" + GIT_TAG "v2.2.4" PREFIX ${PYBIND_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 7d803d00ef..312fbaa0b3 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -349,10 +349,17 @@ function(cc_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(WIN32) + list(APPEND win32_deps shlwapi) + if("${cc_test_DEPS};" MATCHES "python;") + list(REMOVE_ITEM cc_test_DEPS python) + list(APPEND win32_deps ${PYTHON_LIBRARIES}) + endif() + endif(WIN32) add_executable(${TARGET_NAME} ${cc_test_SRCS}) target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) if(WIN32) - target_link_libraries(${TARGET_NAME} shlwapi) + target_link_libraries(${TARGET_NAME} ${win32_deps}) endif(WIN32) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_test(NAME ${TARGET_NAME} @@ -683,7 +690,7 @@ function(py_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true + COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true FLAGS_cpu_deterministic=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 7355b67ab1..c679d8507d 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -129,6 +129,15 @@ if (WITH_MKLDNN) ) endif () +if (WITH_NGRAPH) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph") + copy(ngraph_lib + SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR} + DSTS ${dst_dir} ${dst_dir} + DEPS ngraph + ) +endif () + if (NOT WIN32) if (NOT MOBILE_INFERENCE AND NOT RPI) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") @@ -186,8 +195,7 @@ set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* ${src_dir}/${module}/api/paddle_*.h - ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} + DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) set(module "platform") diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 8397ae093b..14996e3d86 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -26,12 +26,19 @@ paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], vara paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) 
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) -paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None -paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None -paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ReduceStrategy, arg0: int) -> None -paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.BuildStrategy) -> None +paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None +paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) @@ -69,7 +76,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) 
-paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False)) +paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) @@ -97,8 +104,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) -paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0)) -paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) +paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)) paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) @@ -175,7 +182,7 @@ paddle.fluid.layers.clip 
ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -187,6 +194,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -291,6 +299,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, 
defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 0975497a17..c701a2ad63 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -34,6 +34,7 @@ add_subdirectory(ir) add_subdirectory(details) # ddim lib proto_library(framework_proto SRCS framework.proto) +proto_library(async_executor_proto SRCS data_feed.proto) cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) @@ -116,14 +117,9 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) -if (NOT WIN32) cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler transfer_scope_cache) -else() -cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor) -endif(NOT WIN32) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -131,8 +127,9 @@ cc_library(version SRCS version.cc) cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto) + if(NOT WIN32) +cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler) endif(NOT WIN32) @@ -140,7 +137,7 @@ endif(NOT WIN32) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) -py_proto_compile(framework_py_proto SRCS framework.proto) +py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. 
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) @@ -162,18 +159,19 @@ endif(NOT WIN32) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) +cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) -cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) +cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() if(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) else(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) endif(NOT WIN32) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() @@ -181,8 +179,11 @@ endif() cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph build_strategy - fast_threaded_ssa_graph_executor) + fast_threaded_ssa_graph_executor variable_helper) + +cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) +cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc new file mode 100644 index 0000000000..afb2dd2f06 --- /dev/null +++ b/paddle/fluid/framework/async_executor.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/async_executor.h" +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/executor_thread_worker.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/pybind/pybind.h" + +namespace paddle { +namespace framework { +AsyncExecutor::AsyncExecutor(Scope* scope, const platform::Place& place) + : root_scope_(scope), place_(place) {} + +void AsyncExecutor::CreateThreads( + ExecutorThreadWorker* worker, const ProgramDesc& main_program, + const std::shared_ptr& reader, + const std::vector& fetch_var_names, Scope* root_scope, + const int thread_index, const bool debug) { + worker->SetThreadId(thread_index); + worker->SetDebug(debug); + worker->SetRootScope(root_scope); + worker->CreateThreadResource(main_program, place_); + worker->SetDataFeed(reader); + worker->SetFetchVarNames(fetch_var_names); + worker->BindingDataFeedMemory(); +} + +void PrepareReaders(std::vector>& readers, // NOLINT + const int thread_num, const DataFeedDesc& data_feed_desc, + const std::vector& filelist) { + readers.resize(thread_num); + for (size_t i = 0; i < readers.size(); ++i) { + readers[i] = DataFeedFactory::CreateDataFeed(data_feed_desc.name()); + readers[i]->Init(data_feed_desc); // set batch_size and queue_size here + } + readers[0]->SetFileList(filelist); +} + +void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, + const std::string& data_feed_desc_str, + const std::vector& filelist, + const int thread_num, + const std::vector& fetch_var_names, + const bool debug) { + std::vector threads; + + auto& block = main_program.Block(0); + for (auto var_name : fetch_var_names) { + auto var_desc = block.FindVar(var_name); + auto shapes = var_desc->GetShape(); + PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1, + "var %s: Fetched var has wrong shape, " + "only variables with the last dimension size 1 supported", + var_name); + } + + DataFeedDesc data_feed_desc; + google::protobuf::TextFormat::ParseFromString(data_feed_desc_str, + &data_feed_desc); + + int actual_thread_num = thread_num; + int file_cnt = filelist.size(); + PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty"); + + if (actual_thread_num > file_cnt) { + VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt + << ". 
Changing thread_num = " << file_cnt; + actual_thread_num = file_cnt; + } + + /* + readerDesc: protobuf description for reader initialization + argument: class_name, batch_size, use_slot, queue_size, buffer_size, + padding_index + + reader: + 1) each thread has a reader, reader will read input data and + put it into input queue + 2) each reader has a Next() interface, that can fetch an instance + from the input queue + */ + // todo: should be factory method for creating datafeed + std::vector<std::shared_ptr<DataFeed>> readers; + PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist); + + std::vector<std::shared_ptr<ExecutorThreadWorker>> workers; + workers.resize(actual_thread_num); + for (auto& worker : workers) { + worker.reset(new ExecutorThreadWorker); + } + + // prepare thread resource here + for (int thidx = 0; thidx < actual_thread_num; ++thidx) { + CreateThreads(workers[thidx].get(), main_program, readers[thidx], + fetch_var_names, root_scope_, thidx, debug); + } + + // start executing ops in multiple threads + for (int thidx = 0; thidx < actual_thread_num; ++thidx) { + threads.push_back( + std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); + } + + for (auto& th : threads) { + th.join(); + } + + root_scope_->DropKids(); + + return; +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h new file mode 100644 index 0000000000..f4d2a79ac5 --- /dev/null +++ b/paddle/fluid/framework/async_executor.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_thread_worker.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +class AsyncExecutor { + public: + AsyncExecutor(Scope* scope, const platform::Place& place); + virtual ~AsyncExecutor() {} + void RunFromFile(const ProgramDesc& main_program, + const std::string& data_feed_desc_str, + const std::vector& filelist, + const int thread_num, + const std::vector& fetch_names, + const bool debug = false); + + private: + void CreateThreads(ExecutorThreadWorker* worker, + const ProgramDesc& main_program, + const std::shared_ptr& reader, + const std::vector& fetch_var_names, + Scope* root_scope, const int thread_index, + const bool debug); + + public: + Scope* root_scope_; + platform::Place place_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 57ff061fe5..fee6ba4004 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -18,8 +18,8 @@ namespace framework { void TransDataDevice(const Tensor &in, const platform::Place &dst_place, Tensor *out) { - VLOG(30) << "DeviceTransform in, src_place " << in.place() - << " dst_place: " << dst_place; + VLOG(3) << "DeviceTransform in, src_place " << in.place() + << " dst_place: " << dst_place; PADDLE_ENFORCE_NE( in.place().which(), dst_place.which(), diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index 2d2323edc3..c9ec5e7a7b 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -49,10 +49,10 @@ class TestOpWithKernel : public OperatorWithKernel { OpKernelType GetExpectedKernelType( const ExecutionContext& ctx) const override { if (Attr("use_gpu")) { - VLOG(30) << "force use gpu kernel"; + VLOG(3) << "force use gpu kernel"; return OpKernelType(proto::VarType::FP32, platform::CUDAPlace(0)); } else { - VLOG(30) << "use default kernel"; + VLOG(3) << "use default kernel"; return OpKernelType(proto::VarType::FP32, ctx.Input("input")->place()); } @@ -148,7 +148,7 @@ TEST(Operator, CPUtoGPU) { // get output auto* output2 = scope.Var("OUT2"); gpu_op->Run(scope, cuda_place); - VLOG(30) << "after gpu_op run"; + VLOG(3) << "after gpu_op run"; // auto* output2_ptr = output2->Get().data(); paddle::platform::DeviceContextPool& pool = diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc new file mode 100644 index 0000000000..291d8ffc3c --- /dev/null +++ b/paddle/fluid/framework/data_feed.cc @@ -0,0 +1,386 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" + +namespace paddle { +namespace framework { + +std::vector DataFeed::filelist_; +size_t DataFeed::file_idx_; +std::mutex DataFeed::mutex_for_pick_file_; +bool DataFeed::finish_set_filelist_; + +void DataFeed::AddFeedVar(Variable* var, const std::string& name) { + CheckInit(); + for (size_t i = 0; i < use_slots_.size(); ++i) { + if (name == use_slots_[i]) { + if (use_slots_is_dense_[i]) { + feed_vec_[i] = MixTensor(var->GetMutable()); + } else { + feed_vec_[i] = MixTensor(var->GetMutable()); + } + } + } +} + +bool DataFeed::SetFileList(const std::vector& files) { + std::unique_lock lock(mutex_for_pick_file_); + CheckInit(); + if (finish_set_filelist_) { + VLOG(3) << "info: you have set the filelist."; + return false; + } + PADDLE_ENFORCE(files.size(), "You have set an empty filelist."); + filelist_.assign(files.begin(), files.end()); + file_idx_ = 0; + + finish_set_filelist_ = true; + return true; +} + +void DataFeed::SetBatchSize(int batch_size) { + PADDLE_ENFORCE(batch_size > 0, "Illegal batch size: %d.", batch_size); + default_batch_size_ = batch_size; +} + +bool DataFeed::PickOneFile(std::string* filename) { + std::unique_lock lock(mutex_for_pick_file_); + if (file_idx_ == filelist_.size()) { + return false; + } + *filename = filelist_[file_idx_++]; + return true; +} + +void DataFeed::CheckInit() { + PADDLE_ENFORCE(finish_init_, "Initialization did not succeed."); +} + +void DataFeed::CheckSetFileList() { + PADDLE_ENFORCE(finish_set_filelist_, "Set filelist did not succeed."); +} + +void DataFeed::CheckStart() { + PADDLE_ENFORCE(finish_start_, "Datafeed has not started running yet."); +} + +template +void PrivateQueueDataFeed::SetQueueSize(int queue_size) { + PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size); + queue_size_ = queue_size; + queue_ = std::unique_ptr>( + new paddle::operators::reader::BlockingQueue(queue_size_)); +} + +template +bool PrivateQueueDataFeed::Start() { + CheckSetFileList(); + read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this); + read_thread_.detach(); + + finish_start_ = true; + return true; +} + +template +void PrivateQueueDataFeed::ReadThread() { + std::string filename; + while (PickOneFile(&filename)) { + file_.open(filename.c_str()); // is_text_feed + PADDLE_ENFORCE(file_.good(), "Open file<%s> fail.", filename.c_str()); + T instance; + while (ParseOneInstance(&instance)) { + queue_->Send(instance); + } + file_.close(); + } + queue_->Close(); +} + +template +int PrivateQueueDataFeed::Next() { + CheckStart(); + int index = 0; + T instance; + T ins_vec; + while (index < default_batch_size_) { + if (!queue_->Receive(&instance)) { + break; + } + AddInstanceToInsVec(&ins_vec, instance, index++); + } + batch_size_ = index; + if (batch_size_ != 0) { + PutToFeedVec(ins_vec); + } + return batch_size_; +} + +#ifdef _WIN32 +template class PrivateQueueDataFeed>; +#endif + +void MultiSlotDataFeed::Init( + const paddle::framework::DataFeedDesc& data_feed_desc) { + finish_init_ = false; + finish_set_filelist_ = false; + finish_start_ = false; + + PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), + "Multi_slot_desc 
has not been set."); + paddle::framework::MultiSlotDesc multi_slot_desc = + data_feed_desc.multi_slot_desc(); + SetBatchSize(data_feed_desc.batch_size()); + SetQueueSize(data_feed_desc.batch_size()); + size_t all_slot_num = multi_slot_desc.slots_size(); + all_slots_.resize(all_slot_num); + all_slots_type_.resize(all_slot_num); + use_slots_index_.resize(all_slot_num); + use_slots_.clear(); + use_slots_is_dense_.clear(); + for (size_t i = 0; i < all_slot_num; ++i) { + const auto& slot = multi_slot_desc.slots(i); + all_slots_[i] = slot.name(); + all_slots_type_[i] = slot.type(); + use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1; + if (slot.is_used()) { + use_slots_.push_back(all_slots_[i]); + use_slots_is_dense_.push_back(slot.is_dense()); + } + } + feed_vec_.resize(use_slots_.size()); + finish_init_ = true; +} + +bool MultiSlotDataFeed::CheckFile(const char* filename) { + CheckInit(); // get info of slots + std::ifstream fin(filename); + if (!fin.good()) { + VLOG(1) << "error: open file<" << filename << "> fail"; + return false; + } + std::string line; + int instance_cout = 0; + std::string all_slots_alias = ""; + for (const auto& alias : all_slots_) { + all_slots_alias += alias + " "; + } + std::string use_slots_alias = ""; + for (const auto& alias : use_slots_) { + use_slots_alias += alias + " "; + } + VLOG(3) << "total slots num: " << all_slots_.size(); + VLOG(3) << "total slots alias: " << all_slots_alias; + VLOG(3) << "used slots num: " << use_slots_.size(); + VLOG(3) << "used slots alias: " << use_slots_alias; + while (getline(fin, line)) { + ++instance_cout; + const char* str = line.c_str(); + char* endptr = const_cast(str); + int len = line.length(); + for (size_t i = 0; i < all_slots_.size(); ++i) { + int num = strtol(endptr, &endptr, 10); + if (num < 0) { + VLOG(0) << "error: the number of ids is a negative number: " << num; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } else if (num == 0) { + VLOG(0) + << "error: the number of ids can not be zero, you need " + "padding it in data generator; or if there is something wrong" + " with the data, please check if the data contains unresolvable " + "characters."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } else if (errno == ERANGE || num > INT_MAX) { + VLOG(0) << "error: the number of ids greater than INT_MAX"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + if (all_slots_type_[i] == "float") { + for (int i = 0; i < num; ++i) { + strtof(endptr, &endptr); + if (errno == ERANGE) { + VLOG(0) << "error: the value is out of the range of " + "representable values for float"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + if (i + 1 != num && endptr - str == len) { + VLOG(0) << "error: there is a wrong with the number of ids."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + } + } else if (all_slots_type_[i] == "uint64") { + for (int i = 0; i < num; ++i) { + strtoull(endptr, &endptr, 10); + if (errno == ERANGE) { + VLOG(0) << "error: the value is out of the range of " + "representable values for uint64_t"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + if (i + 1 != num && endptr - str == len) { + VLOG(0) << "error: there is a wrong with the number of ids."; + 
VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + } + } else { + VLOG(0) << "error: this type<" << all_slots_type_[i] + << "> is not supported"; + return false; + } + } + // It may be added '\t' character to the end of the output of reduce + // task when processes data by Hadoop(when the output of the reduce + // task of Hadoop has only one field, it will add a '\t' at the end + // of the line by default, and you can use this option to avoid it: + // `-D mapred.textoutputformat.ignoreseparator=true`), which does + // not affect the correctness of the data. Therefore, it should be + // judged that the data is not normal when the end of each line of + // data contains characters which are not spaces. + while (endptr - str != len) { + if (!isspace(*(endptr++))) { + VLOG(0) + << "error: there is some extra characters at the end of the line."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + } + } + VLOG(3) << "instances cout: " << instance_cout; + VLOG(3) << "The file format is correct"; + return true; +} + +bool MultiSlotDataFeed::ParseOneInstance(std::vector* instance) { + std::string line; + if (getline(file_, line)) { + int use_slots_num = use_slots_.size(); + instance->resize(use_slots_num); + // parse line + const char* str = line.c_str(); + char* endptr = const_cast(str); + int pos = 0; + for (size_t i = 0; i < use_slots_index_.size(); ++i) { + int idx = use_slots_index_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE( + num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + if (idx != -1) { + (*instance)[idx].Init(all_slots_type_[i]); + if ((*instance)[idx].GetType()[0] == 'f') { // float + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + (*instance)[idx].AddValue(feasign); + } + } else if ((*instance)[idx].GetType()[0] == 'u') { // uint64 + for (int j = 0; j < num; ++j) { + uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10); + (*instance)[idx].AddValue(feasign); + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + pos = line.find_first_of(' ', pos + 1); + } + } + } + } else { + return false; + } + return true; +} + +void MultiSlotDataFeed::AddInstanceToInsVec( + std::vector* ins_vec, + const std::vector& instance, int index) { + if (index == 0) { + ins_vec->resize(instance.size()); + for (size_t i = 0; i < instance.size(); ++i) { + (*ins_vec)[i].Init(instance[i].GetType()); + (*ins_vec)[i].InitOffset(); + } + } + for (size_t i = 0; i < instance.size(); ++i) { + (*ins_vec)[i].AddIns(instance[i]); + } +} + +void MultiSlotDataFeed::PutToFeedVec( + const std::vector& ins_vec) { + for (size_t i = 0; i < use_slots_.size(); ++i) { + const auto& type = ins_vec[i].GetType(); + const auto& offset = ins_vec[i].GetOffset(); + int total_instance = static_cast(offset.back()); + if (type[0] == 'f') { // float + const auto& feasign = ins_vec[i].GetFloatData(); + if (feed_vec_[i].IsDense()) { + int size_in_each_batch = total_instance / batch_size_; + float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data( + {batch_size_, size_in_each_batch}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); + } else { + float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data( + 
{total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); + LoD data_lod{offset}; + feed_vec_[i].GetLoDTensor()->set_lod(data_lod); + } + } else if (type[0] == 'u') { // uint64 + // no uint64_t type in paddlepaddle + const auto& feasign = ins_vec[i].GetUint64Data(); + if (feed_vec_[i].IsDense()) { + int size_in_each_batch = total_instance / batch_size_; + int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<int64_t>( + {batch_size_, size_in_each_batch}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + } else { + int64_t* tensor_ptr = + feed_vec_[i].GetLoDTensor()->mutable_data<int64_t>( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + LoD data_lod{offset}; + feed_vec_[i].GetLoDTensor()->set_lod(data_lod); + } + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h new file mode 100644 index 0000000000..a7f8d1d317 --- /dev/null +++ b/paddle/fluid/framework/data_feed.h @@ -0,0 +1,269 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <fstream> +#include <memory> +#include <mutex> // NOLINT +#include <string> +#include <thread> // NOLINT +#include <vector> + +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" + +namespace paddle { +namespace framework { + +// Pack Tensor type and LoDTensor type into MixTensor type, in order +// to record either Tensor or LoDTensor information at the same time. +class MixTensor { + public: + MixTensor() {} + explicit MixTensor(LoDTensor* lodtensor) { + is_dense_ = false; + lodtensor_ = lodtensor; + } + explicit MixTensor(Tensor* tensor) { + is_dense_ = true; + tensor_ = tensor; + } + bool IsDense() { return is_dense_; } + LoDTensor* GetLoDTensor() { + PADDLE_ENFORCE(!is_dense_, "Let a dense var return a LoDTensor ptr."); + return lodtensor_; + } + Tensor* GetTensor() { + PADDLE_ENFORCE(is_dense_, "Let a sparse var return a Tensor ptr."); + return tensor_; + } + + private: + bool is_dense_; + LoDTensor* lodtensor_; + Tensor* tensor_; +}; + +// DataFeed is the base virtual class for all other DataFeeds. +// It is used to read files and parse the data for the subsequent trainer.
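+// A DataFeed picks its input files from a shared filelist, parses each instance into the slots declared in its DataFeedDesc, and fills the bound feed variables one batch per Next() call.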
+// Example: +// DataFeed* reader = +// paddle::framework::DataFeedFactory::CreateDataFeed(data_feed_name); +// reader->Init(data_feed_desc); // data_feed_desc is a protobuf object +// reader->SetFileList(filelist); +// const std::vector<std::string>& use_slot_alias = +// reader->GetUseSlotAlias(); +// for (auto name: use_slot_alias){ // for binding memory +// reader->AddFeedVar(scope->Var(name), name); +// } +// reader->Start(); +// while (reader->Next()) { +// // trainer do something +// } +class DataFeed { + public: + DataFeed() {} + virtual ~DataFeed() {} + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0; + virtual bool CheckFile(const char* filename) { + PADDLE_THROW("This function(CheckFile) is not implemented."); + } + // Set filelist for DataFeed. + // Note that all readers must be initialized before this function is called; + // otherwise the Init() function will reset the finish_set_filelist_ flag. + virtual bool SetFileList(const std::vector<std::string>& files); + virtual bool Start() = 0; + // The trainer calls the Next() function, and the DataFeed will load a new + // batch to the feed_vec. The return value of this function is the batch + // size of the current batch. + virtual int Next() = 0; + // Get all slots' aliases, as defined in the protofile + virtual const std::vector<std::string>& GetAllSlotAlias() { + return all_slots_; + } + // Get the used slots' aliases, as defined in the protofile + virtual const std::vector<std::string>& GetUseSlotAlias() { + return use_slots_; + } + // This function is used for binding feed_vec memory + virtual void AddFeedVar(Variable* var, const std::string& name); + + protected: + // The following three functions are used to check if it is executed in this + // order: + // Init() -> SetFileList() -> Start() -> Next() + virtual void CheckInit(); + virtual void CheckSetFileList(); + virtual void CheckStart(); + virtual void SetBatchSize( + int batch); // batch size will be set in Init() function + // This function is used to pick one file from the global filelist (thread + // safe). + virtual bool PickOneFile(std::string* filename); + + static std::vector<std::string> filelist_; + static size_t file_idx_; + static std::mutex mutex_for_pick_file_; + + // the aliases of used slots, and their order is determined by + // data_feed_desc(proto object) + std::vector<std::string> use_slots_; + std::vector<bool> use_slots_is_dense_; + + // the aliases of all slots, and their order is determined by data_feed_desc(proto + // object) + std::vector<std::string> all_slots_; + std::vector<std::string> all_slots_type_; + std::vector<int> + use_slots_index_; // -1: not used; >=0: the index of use_slots_ + + // The data read by DataFeed will be stored here + std::vector<MixTensor> feed_vec_; + + // the batch size defined by user + int default_batch_size_; + // current batch size + int batch_size_; + + bool finish_init_; + static bool finish_set_filelist_; + bool finish_start_; +}; + +// PrivateQueueDataFeed is the base virtual class for other DataFeeds. +// It uses a read thread to read files and parse data into a thread-level +// private queue, and gets data from this queue when the trainer calls Next(). +template <typename T> +class PrivateQueueDataFeed : public DataFeed { + public: + PrivateQueueDataFeed() {} + virtual ~PrivateQueueDataFeed() {} + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0; + virtual bool Start(); + virtual int Next(); + + protected: + // The thread implementation function for reading and parsing files.
+ virtual void ReadThread(); + // This function is used to set private-queue size, and the most + // efficient when the queue size is close to the batch size. + virtual void SetQueueSize(int queue_size); + // The reading and parsing method called in the ReadThread. + virtual bool ParseOneInstance(T* instance) = 0; + // This function is used to put instance to vec_ins + virtual void AddInstanceToInsVec(T* vec_ins, const T& instance, + int index) = 0; + // This function is used to put ins_vec to feed_vec + virtual void PutToFeedVec(const T& ins_vec) = 0; + + // The thread for read files + std::thread read_thread_; + // using ifstream one line and one line parse is faster + // than using fread one buffer and one buffer parse. + // for a 601M real data: + // ifstream one line and one line parse: 6034 ms + // fread one buffer and one buffer parse: 7097 ms + std::ifstream file_; + size_t queue_size_; + // The queue for store parsed data + std::unique_ptr> queue_; +}; + +// This class define the data type of instance(ins_vec) in MultiSlotDataFeed +class MultiSlotType { + public: + MultiSlotType() {} + ~MultiSlotType() {} + void Init(const std::string& type) { + CheckType(type); + if (type_[0] == 'f') { + float_feasign_.clear(); + } else if (type_[0] == 'u') { + uint64_feasign_.clear(); + } + type_ = type; + } + void InitOffset() { + offset_.resize(1); + // LoDTensor' lod is counted from 0, the size of lod + // is one size larger than the size of data. + offset_[0] = 0; + } + const std::vector& GetOffset() const { return offset_; } + void AddValue(const float v) { + CheckFloat(); + float_feasign_.push_back(v); + } + void AddValue(const uint64_t v) { + CheckUint64(); + uint64_feasign_.push_back(v); + } + void AddIns(const MultiSlotType& ins) { + if (ins.GetType()[0] == 'f') { // float + CheckFloat(); + auto& vec = ins.GetFloatData(); + offset_.push_back(offset_.back() + vec.size()); + float_feasign_.insert(float_feasign_.end(), vec.begin(), vec.end()); + } else if (ins.GetType()[0] == 'u') { // uint64 + CheckUint64(); + auto& vec = ins.GetUint64Data(); + offset_.push_back(offset_.back() + vec.size()); + uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end()); + } + } + const std::vector& GetFloatData() const { return float_feasign_; } + const std::vector& GetUint64Data() const { return uint64_feasign_; } + const std::string& GetType() const { return type_; } + + private: + void CheckType(const std::string& type) const { + PADDLE_ENFORCE((type == "uint64") || (type == "float"), + "There is no this type<%s>.", type); + } + void CheckFloat() const { + PADDLE_ENFORCE(type_[0] == 'f', "Add %s value to float slot.", type_); + } + void CheckUint64() const { + PADDLE_ENFORCE(type_[0] == 'u', "Add %s value to uint64 slot.", type_); + } + std::vector float_feasign_; + std::vector uint64_feasign_; + std::string type_; + std::vector offset_; +}; + +// This DataFeed is used to feed multi-slot type data. +// The format of multi-slot type data: +// [n feasign_0 feasign_1 ... 
feasign_n]* +class MultiSlotDataFeed + : public PrivateQueueDataFeed> { + public: + MultiSlotDataFeed() {} + virtual ~MultiSlotDataFeed() {} + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc); + virtual bool CheckFile(const char* filename); + + protected: + virtual void AddInstanceToInsVec(std::vector* vec_ins, + const std::vector& instance, + int index); + virtual bool ParseOneInstance(std::vector* instance); + virtual void PutToFeedVec(const std::vector& ins_vec); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto new file mode 100644 index 0000000000..489fec08d8 --- /dev/null +++ b/paddle/fluid/framework/data_feed.proto @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +syntax = "proto2"; +package paddle.framework; + +message Slot { + required string name = 1; + required string type = 2; + optional bool is_dense = 3 [ default = false ]; + optional bool is_used = 4 [ default = false ]; +} + +message MultiSlotDesc { repeated Slot slots = 1; } + +message DataFeedDesc { + optional string name = 1; + optional int32 batch_size = 2 [ default = 32 ]; + optional MultiSlotDesc multi_slot_desc = 3; +} diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc new file mode 100644 index 0000000000..72148b9f7d --- /dev/null +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_feed_factory.h" +#include +#include +#include + +#include "paddle/fluid/framework/data_feed.h" + +namespace paddle { +namespace framework { +typedef std::shared_ptr (*Createdata_feedFunction)(); +typedef std::unordered_map data_feedMap; +data_feedMap g_data_feed_map; + +#define REGISTER_DATAFEED_CLASS(data_feed_class) \ + namespace { \ + std::shared_ptr Creator_##data_feed_class() { \ + return std::shared_ptr(new data_feed_class); \ + } \ + class __Registerer_##data_feed_class { \ + public: \ + __Registerer_##data_feed_class() { \ + g_data_feed_map[#data_feed_class] = &Creator_##data_feed_class; \ + } \ + }; \ + __Registerer_##data_feed_class g_registerer_##data_feed_class; \ + } // namespace + +std::string DataFeedFactory::DataFeedTypeList() { + std::string data_feed_types; + for (auto iter = g_data_feed_map.begin(); iter != g_data_feed_map.end(); + ++iter) { + if (iter != g_data_feed_map.begin()) { + data_feed_types += ", "; + } + data_feed_types += iter->first; + } + return data_feed_types; +} + +std::shared_ptr DataFeedFactory::CreateDataFeed( + std::string data_feed_class) { + if (g_data_feed_map.count(data_feed_class) < 1) { + exit(-1); + } + return g_data_feed_map[data_feed_class](); +} + +REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed_factory.h b/paddle/fluid/framework/data_feed_factory.h new file mode 100644 index 0000000000..13678edb0b --- /dev/null +++ b/paddle/fluid/framework/data_feed_factory.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/data_feed.h" + +namespace paddle { +namespace framework { +class DataFeedFactory { + public: + static std::string DataFeedTypeList(); + static std::shared_ptr CreateDataFeed(std::string data_feed_class); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc new file mode 100644 index 0000000000..3974f8dbad --- /dev/null +++ b/paddle/fluid/framework/data_feed_test.cc @@ -0,0 +1,337 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
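data_feed_factory.cc keeps a global name-to-creator map and the REGISTER_DATAFEED_CLASS macro adds one entry per DataFeed subclass. Below is a hedged sketch of how a caller would obtain and drive a reader through this factory, pieced together from the API that the unit test further down exercises (CreateDataFeed, Init, SetFileList, AddFeedVar, Start, Next); the prototxt literal, the file name, and the slot name are made up for illustration.

    #include "google/protobuf/text_format.h"
    #include "paddle/fluid/framework/data_feed_factory.h"
    #include "paddle/fluid/framework/scope.h"

    // Sketch: build a MultiSlotDataFeed from a DataFeedDesc and read batches.
    void ReadBatches() {
      paddle::framework::DataFeedDesc desc;
      google::protobuf::TextFormat::ParseFromString(
          "name: \"MultiSlotDataFeed\"\n"
          "batch_size: 2\n"
          "multi_slot_desc {\n"
          "  slots { name: \"ids\" type: \"uint64\" is_used: true }\n"
          "}",
          &desc);

      auto reader =
          paddle::framework::DataFeedFactory::CreateDataFeed(desc.name());
      reader->Init(desc);
      reader->SetFileList({"part-000.txt"});  // hypothetical data file

      paddle::framework::Scope scope;
      reader->AddFeedVar(scope.Var("ids"), "ids");  // bind slot to a variable
      reader->Start();
      while (reader->Next()) {
        // each Next() fills the bound variables with one batch
      }
    }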
+ +#include "paddle/fluid/framework/data_feed.h" +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" + +paddle::framework::DataFeedDesc load_datafeed_param_from_file( + const char* filename) { + paddle::framework::DataFeedDesc data_feed_desc; + int file_descriptor = open(filename, O_RDONLY); + PADDLE_ENFORCE(file_descriptor != -1, "Can not open %s.", filename); + google::protobuf::io::FileInputStream fileInput(file_descriptor); + google::protobuf::TextFormat::Parse(&fileInput, &data_feed_desc); + close(file_descriptor); + return data_feed_desc; +} + +const std::vector load_filelist_from_file(const char* filename) { + std::vector filelist; + std::ifstream fin(filename); + PADDLE_ENFORCE(fin.good(), "Can not open %s.", filename); + std::string line; + while (getline(fin, line)) { + filelist.push_back(line); + } + fin.close(); + return filelist; +} + +void GenerateFileForTest(const char* protofile, const char* filelist) { + std::ofstream w_protofile(protofile); + w_protofile << "name: \"MultiSlotDataFeed\"\n" + "batch_size: 2\n" + "multi_slot_desc {\n" + " slots {\n" + " name: \"uint64_sparse_slot\"\n" + " type: \"uint64\"\n" + " is_dense: false\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"float_sparse_slot\"\n" + " type: \"float\"\n" + " is_dense: false\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"uint64_dense_slot\"\n" + " type: \"uint64\"\n" + " is_dense: true\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"float_dense_slot\"\n" + " type: \"float\"\n" + " is_dense: true\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"not_used_slot\"\n" + " type: \"uint64\"\n" + " is_dense: false\n" + " is_used: false\n" + " }\n" + "}"; + w_protofile.close(); + std::ofstream w_filelist(filelist); + int total_file = 4; + for (int i = 0; i < total_file; ++i) { + std::string filename = "TestMultiSlotDataFeed.data." 
+ std::to_string(i); + w_filelist << filename; + if (i + 1 != total_file) { + w_filelist << std::endl; + } + std::ofstream w_datafile(filename.c_str()); + w_datafile << "3 3978 620 82 1 1926.08 1 1926 1 6.02 1 1996\n" + "2 1300 2983353 1 985.211 1 8 1 0.618 1 12\n" + "1 19260827 2 3.14 2.718 1 27 1 2.236 1 28\n"; + w_datafile.close(); + } + w_filelist.close(); +} + +class MultiTypeSet { + public: + MultiTypeSet() { + uint64_set_.clear(); + float_set_.clear(); + } + ~MultiTypeSet() {} + void AddValue(uint64_t v) { uint64_set_.insert(v); } + void AddValue(float v) { float_set_.insert(v); } + const std::set& GetUint64Set() const { return uint64_set_; } + const std::set& GetFloatSet() const { return float_set_; } + + private: + std::set uint64_set_; + std::set float_set_; +}; + +void GetElemSetFromReader(std::vector* reader_elem_set, + const paddle::framework::DataFeedDesc& data_feed_desc, + const std::vector& filelist, + const int thread_num) { + int used_slot_num = 0; + for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) { + if (data_feed_desc.multi_slot_desc().slots(i).is_used()) { + ++used_slot_num; + } + } + reader_elem_set->resize(used_slot_num); + std::vector threads; + std::vector> readers; + readers.resize(thread_num); + for (int i = 0; i < thread_num; ++i) { + readers[i] = paddle::framework::DataFeedFactory::CreateDataFeed( + data_feed_desc.name()); + readers[i]->Init(data_feed_desc); + } + readers[0]->SetFileList(filelist); + std::mutex mu; + for (int idx = 0; idx < thread_num; ++idx) { + threads.emplace_back(std::thread([&, idx] { + std::unique_ptr scope( + new paddle::framework::Scope()); + const auto& multi_slot_desc = data_feed_desc.multi_slot_desc(); + std::map + lodtensor_targets; + std::map tensor_targets; + for (int i = 0; i < multi_slot_desc.slots_size(); ++i) { + const auto& slot = multi_slot_desc.slots(i); + if (slot.is_used()) { + const auto& name = slot.name(); + readers[idx]->AddFeedVar(scope->Var(name), name); + if (slot.is_dense()) { + tensor_targets[name] = + &scope->FindVar(name)->Get(); + } else { + lodtensor_targets[name] = + &scope->FindVar(name)->Get(); + } + } + } + readers[idx]->Start(); + while (readers[idx]->Next()) { + int index = 0; + for (int k = 0; k < multi_slot_desc.slots_size(); ++k) { + const auto& slot = multi_slot_desc.slots(k); + if (!slot.is_used()) { + continue; + } + if (slot.is_dense()) { // dense branch + const paddle::framework::Tensor* tens = tensor_targets[slot.name()]; + if (slot.type() == "uint64") { + const int64_t* data = tens->data(); + int batch_size = tens->dims()[0]; + int dim = tens->dims()[1]; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < dim; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue( + (uint64_t)data[i * dim + j]); + } + } + } else if (slot.type() == "float") { + const float* data = tens->data(); + int batch_size = tens->dims()[0]; + int dim = tens->dims()[1]; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < dim; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue(data[i * dim + j]); + } + } + } else { + PADDLE_THROW("Error type in proto file."); + } + } else { // sparse branch + const paddle::framework::LoDTensor* tens = + lodtensor_targets[slot.name()]; + if (slot.type() == "uint64") { + const int64_t* data = tens->data(); + for (size_t i = 0; i < tens->NumElements(); ++i) { + std::pair element = tens->lod_element(0, i); + for (size_t j = element.first; j < element.second; ++j) { + std::lock_guard lock(mu); + 
(*reader_elem_set)[index].AddValue((uint64_t)data[j]); + } + } + } else if (slot.type() == "float") { + const float* data = tens->data(); + for (size_t i = 0; i < tens->NumElements(); ++i) { + std::pair element = tens->lod_element(0, i); + for (size_t j = element.first; j < element.second; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue(data[j]); + } + } + } else { + PADDLE_THROW("Error type in proto file."); + } + } // end sparse branch + ++index; + } // end slots loop + } // end while Next() + })); // end anonymous function + } + for (auto& th : threads) { + th.join(); + } +} + +void CheckIsUnorderedSame(const std::vector& s1, + const std::vector& s2) { + EXPECT_EQ(s1.size(), s2.size()); + for (size_t i = 0; i < s1.size(); ++i) { + // check for uint64 + const std::set& uint64_s1 = s1[i].GetUint64Set(); + const std::set& uint64_s2 = s2[i].GetUint64Set(); + EXPECT_EQ(uint64_s1.size(), uint64_s2.size()); + auto uint64_it1 = uint64_s1.begin(); + auto uint64_it2 = uint64_s2.begin(); + while (uint64_it1 != uint64_s1.end()) { + EXPECT_EQ(*uint64_it1, *uint64_it2); + ++uint64_it1; + ++uint64_it2; + } + // check for float + const std::set& float_s1 = s1[i].GetFloatSet(); + const std::set& float_s2 = s2[i].GetFloatSet(); + EXPECT_EQ(float_s1.size(), float_s2.size()); + auto float_it1 = float_s1.begin(); + auto float_it2 = float_s2.begin(); + while (float_it1 != float_s1.end()) { + EXPECT_EQ(*float_it1, *float_it2); + ++float_it1; + ++float_it2; + } + } +} + +void GetElemSetFromFile(std::vector* file_elem_set, + const paddle::framework::DataFeedDesc& data_feed_desc, + const std::vector& filelist) { + int used_slot_num = 0; + for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) { + if (data_feed_desc.multi_slot_desc().slots(i).is_used()) { + ++used_slot_num; + } + } + file_elem_set->resize(used_slot_num); + for (const auto& file : filelist) { + std::ifstream fin(file.c_str()); + PADDLE_ENFORCE(fin.good(), "Can not open %s.", file.c_str()); + while (1) { + bool end_flag = false; + int index = 0; + for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) { + int num; + if (fin >> num) { + auto slot = data_feed_desc.multi_slot_desc().slots(i); + auto type = slot.type(); + if (type == "uint64") { + while (num--) { + uint64_t feasign; + fin >> feasign; + if (slot.is_used()) { + (*file_elem_set)[index].AddValue(feasign); + } + } + } else if (type == "float") { + while (num--) { + float feasign; + fin >> feasign; + if (slot.is_used()) { + (*file_elem_set)[index].AddValue(feasign); + } + } + } else { + PADDLE_THROW("Error type in proto file."); + } + if (slot.is_used()) { + ++index; + } + } else { + end_flag = true; + break; + } + } + if (end_flag) { + break; + } + } + fin.close(); + } +} + +TEST(DataFeed, MultiSlotUnitTest) { + const char* protofile = "data_feed_desc.prototxt"; + const char* filelist_name = "filelist.txt"; + GenerateFileForTest(protofile, filelist_name); + const std::vector filelist = + load_filelist_from_file(filelist_name); + paddle::framework::DataFeedDesc data_feed_desc = + load_datafeed_param_from_file(protofile); + std::vector reader_elem_set; + std::vector file_elem_set; + GetElemSetFromReader(&reader_elem_set, data_feed_desc, filelist, 4); + GetElemSetFromFile(&file_elem_set, data_feed_desc, filelist); + CheckIsUnorderedSame(reader_elem_set, file_elem_set); +} diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index d6b5ad4570..93288936fe 100644 --- 
a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -39,11 +39,12 @@ if (WITH_GPU) endif() cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) +cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass) if (WITH_GPU) list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) endif() diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc new file mode 100644 index 0000000000..fe21e21bcf --- /dev/null +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/all_reduce_deps_pass.h" +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/op_graph_view.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace details { + +static constexpr char kAllOpDescs[] = "all_op_descs"; + +VarHandle* GetValidInput(const OpHandleBase* a) { + for (auto p : a->Inputs()) { + VarHandle* b = dynamic_cast(p); + if (b) { + return b; + } + } + + return nullptr; +} + +std::unique_ptr AllReduceDepsPass::ApplyImpl( + std::unique_ptr graph) const { + auto graph_ops = ir::FilterByNodeWrapper(*graph); + + // get vars order + int order = 0; + std::unordered_map vars; + // TODO(gongwb): use graph topology sort to find the order of operators. + // Note that must assert topology sort is stable + auto& ops = Get>(kAllOpDescs); + for (auto* op_desc : ops) { + auto outputs = op_desc->Outputs(); + for (auto& o_it : outputs) { + for (auto& v : o_it.second) { // values + vars[v] = order; + } + } + order++; + } + + std::vector dist_ops; + // get allreduce ops. + for (auto& op : graph_ops) { + // FIXME(gongwb):add broad cast. 
+ if (op->Name() == "all_reduce" || op->Name() == "reduce") { + dist_ops.push_back(op); + } + } + + VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl; + + std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1, + OpHandleBase* op2) { + VarHandle* i0 = dynamic_cast(GetValidInput(op1)); + VarHandle* i1 = dynamic_cast(GetValidInput(op2)); + + PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error", + op1->DebugString(), op2->DebugString()); + + auto l_it = vars.find(i0->name_); + auto r_it = vars.find(i1->name_); + + if (l_it->second < r_it->second) return true; + + if (l_it->second == r_it->second) { + return i0->name_ < i1->name_; + } + + return false; + }); + + // add dependency. + auto& sorted_ops = dist_ops; + for (size_t i = 1; i < sorted_ops.size(); ++i) { + auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + + auto* pre_op = sorted_ops[i - 1]; + auto* op = sorted_ops[i]; + + pre_op->AddOutput(dep_var); + op->AddInput(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); + + VLOG(10) << "add all_reduce sequential dependencies between " << pre_op + << " and " << op; + + VLOG(10) << "pre_op:" << pre_op->DebugString() + << ", op:" << op->DebugString(); + } + + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(all_reduce_deps_pass, + paddle::framework::details::AllReduceDepsPass) + .RequirePassAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.h b/paddle/fluid/framework/details/all_reduce_deps_pass.h new file mode 100644 index 0000000000..e8b9108981 --- /dev/null +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +// TODO(gongwb): overlap allreduce with backward computation. 
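The pass above enforces a deterministic order on all_reduce/reduce op handles: it ranks their input variables by the position of the OpDesc that first produces them, sorts the handles by that rank (tie-breaking by name), and then chains consecutive handles with control-dependency dummy variables. Per the build_strategy.cc change later in this patch, it is only appended when num_trainers_ > 1 and sequential execution is disabled. A minimal standalone sketch of the ordering idea with plain STL types; the gradient names are illustrative.

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Illustrative: rank gradients by the program step that produces them,
    // then chain them pairwise, as all_reduce_deps_pass does with
    // control-dependency variables between op handles.
    int main() {
      std::unordered_map<std::string, int> first_def = {
          {"fc_0.w@GRAD", 0}, {"fc_1.w@GRAD", 1}, {"fc_2.w@GRAD", 2}};

      std::vector<std::string> all_reduce_inputs = {
          "fc_2.w@GRAD", "fc_0.w@GRAD", "fc_1.w@GRAD"};
      std::sort(all_reduce_inputs.begin(), all_reduce_inputs.end(),
                [&](const std::string& a, const std::string& b) {
                  if (first_def[a] != first_def[b])
                    return first_def[a] < first_def[b];
                  return a < b;  // tie-break by name, like the pass
                });

      for (size_t i = 1; i < all_reduce_inputs.size(); ++i) {
        std::cout << all_reduce_inputs[i - 1] << " must run before "
                  << all_reduce_inputs[i] << "\n";
      }
      return 0;
    }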
+class AllReduceDepsPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index b869015676..a003995ae3 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -23,7 +23,7 @@ namespace paddle { namespace framework { namespace details { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -74,7 +74,7 @@ void AllReduceOpHandle::RunImpl() { } if (platform::is_gpu_place(lod_tensors[0]->place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); int dtype = -1; size_t numel = 0; diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index f6ef3a1367..b449796fca 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -29,7 +29,7 @@ namespace framework { namespace details { struct AllReduceOpHandle : public OpHandleBase { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs); @@ -49,7 +49,7 @@ struct AllReduceOpHandle : public OpHandleBase { private: std::vector local_scopes_; std::vector places_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const platform::NCCLContextMap *nccl_ctxs_; #endif }; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 8e5e542765..cf280c29ff 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -60,7 +60,7 @@ void BroadcastOpHandle::BroadcastOneVar( PADDLE_ENFORCE_NOT_NULL(in_var); Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); if (UNLIKELY(!in_tensor.IsInitialized())) { - VLOG(30) << "in var " << in_var_handle.name_ << "not inited, return!"; + VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!"; return; } @@ -82,7 +82,7 @@ void BroadcastOpHandle::BroadcastOneVar( }); } } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) VarHandle *out_handle = nullptr; int root_id = boost::get(in_tensor.place()).device; std::vector> broadcast_calls; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 72180fac86..0c75e05f86 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -24,7 +24,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && 
!defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -34,7 +34,7 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { public: -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *nccl_ctxs) @@ -68,7 +68,7 @@ struct BroadcastOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const platform::NCCLContextMap *nccl_ctxs_; #endif diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 4305eb6573..df3b3cc9ca 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -42,7 +42,7 @@ struct TestBroadcastOpHandle { std::vector> nodes_; std::vector place_list_; bool use_gpu_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; #endif @@ -50,7 +50,7 @@ struct TestBroadcastOpHandle { for (size_t j = 0; j < ctxs_.size(); ++j) { ctxs_[j]->Wait(); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_) { nccl_ctxs_->WaitAll(); } @@ -60,7 +60,7 @@ struct TestBroadcastOpHandle { void InitCtxOnGpu(bool use_gpu) { use_gpu_ = use_gpu; if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) int count = p::GetCUDADeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -84,7 +84,7 @@ struct TestBroadcastOpHandle { place_list_.push_back(p); ctxs_.emplace_back(new p::CPUDeviceContext(p)); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_.reset(nullptr); #endif } @@ -106,14 +106,14 @@ struct TestBroadcastOpHandle { nodes_.emplace_back( ir::CreateNodeForTest("node0", ir::Node::Type::kOperation)); if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else PADDLE_THROW("CUDA is not support."); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 37202f8695..523f9eadf2 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -16,6 +16,7 @@ limitations under the License. 
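The recurring edit in these op handles replaces `#ifdef PADDLE_WITH_CUDA` with `#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)`, so NCCL-dependent code also drops out on Windows, where NCCL is not available. A tiny sketch of the guard for readers less used to the `defined()` operator; the USE_NCCL macro is just for illustration.

    // True only when CUDA support is compiled in AND the target is not
    // Windows; #ifdef alone cannot express the second condition.
    #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
    #define USE_NCCL 1
    #else
    #define USE_NCCL 0
    #endif

    #include <iostream>

    int main() {
      std::cout << "NCCL-backed collectives enabled: " << USE_NCCL << "\n";
      return 0;
    }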
*/ #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" +#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" @@ -24,6 +25,10 @@ namespace paddle { namespace framework { namespace details { +static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { + return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1); +} + class ParallelExecutorPassBuilder : public ir::PassBuilder { public: explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy) @@ -70,6 +75,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Verify that the graph is correct for multi-device executor. AppendPass("multi_devices_check_pass"); + if (SeqOnlyAllReduceOps(strategy)) { + AppendPass("all_reduce_deps_pass"); + } + if (strategy_.remove_unnecessary_lock_) { AppendPass("modify_op_lock_and_record_event_pass"); } @@ -96,7 +105,7 @@ std::unique_ptr BuildStrategy::Apply( const std::string &loss_var_name, const std::unordered_set ¶m_names, const std::vector &local_scopes, -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { #else const bool use_cuda) const { @@ -118,12 +127,23 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("local_scopes"); pass->SetNotOwned>("local_scopes", &local_scopes); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif } else if (pass->Type() == "sequential_execution_pass") { + VLOG(1) << "set enable_sequential_execution:" + << enable_sequential_execution_; + + pass->Erase(kAllOpDescs); + pass->Set>( + kAllOpDescs, + new std::vector(main_program.Block(0).AllOps())); + } else if (pass->Type() == "all_reduce_deps_pass") { + VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) + << ", num_trainers:" << num_trainers_; + pass->Erase(kAllOpDescs); pass->Set>( kAllOpDescs, @@ -144,4 +164,5 @@ USE_PASS(multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); USE_PASS(sequential_execution_pass); +USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index fc2641dbd4..9f0a259128 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -23,7 +23,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -73,6 +73,7 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; + int num_trainers_{1}; bool remove_unnecessary_lock_{false}; // NOTE: @@ -98,7 +99,7 @@ struct BuildStrategy { const std::string &loss_var_name, const std::unordered_set ¶m_names, const std::vector &local_scopes, -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; #else const bool use_cuda) const; diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc 
b/paddle/fluid/framework/details/data_balance_op_handle.cc index 0b772f9b63..cc562c7b10 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -20,7 +20,7 @@ namespace paddle { namespace framework { namespace details { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) DataBalanceOpHandle::DataBalanceOpHandle( ir::Node *node, const std::vector &local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h index 0462fb6ec7..2db18a1a72 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.h +++ b/paddle/fluid/framework/details/data_balance_op_handle.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -29,7 +29,7 @@ namespace details { struct DataBalanceOpHandle : public OpHandleBase { public: -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) DataBalanceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs); diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index e37259526a..e43d545c9c 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -25,7 +25,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -35,7 +35,7 @@ namespace details { struct FusedBroadcastOpHandle : public BroadcastOpHandle { public: -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) FusedBroadcastOpHandle(ir::Node *node, const std::vector local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc index 541993c743..be0d941c4f 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -44,14 +44,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { nodes_.emplace_back( ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation)); if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else PADDLE_THROW("CUDA is not supported."); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else diff --git a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc index bf3f3637b5..67aad9f94f 100644 --- a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc +++ b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc @@ -45,8 +45,8 @@ std::unique_ptr 
ModifyOpLockAndRecordEventPass::ApplyImpl( IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view); compute_op->SetLockAndRecordEventFree(is_lock_and_record_event_free); if (is_lock_and_record_event_free) { - VLOG(100) << "Set is_lock_and_record_event_free be true in op " - << compute_op->DebugString(); + VLOG(10) << "Set is_lock_and_record_event_free be true in op " + << compute_op->DebugString(); } } return ir_graph; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 8c98b78130..03f5f2e73a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -142,7 +142,7 @@ void MultiDevSSAGraphBuilder::Init() const { places_ = Get>(kPlaces); local_scopes_ = Get>(kLocalScopes); strategy_ = Get(kStrategy); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_ = &Get("nccl_ctxs"); #endif @@ -399,7 +399,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( for (size_t i = 0; i < backward_vars.size(); i += 2) { auto &p_name = backward_vars[i]; auto &g_name = backward_vars[i + 1]; - VLOG(100) << "Bcast " << g_name << " for parameter " << p_name; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; switch (strategy_.reduce_) { case BuildStrategy::ReduceStrategy::kReduce: @@ -431,7 +431,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } bool use_gpu = false; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) use_gpu = nccl_ctxs_ != nullptr; #endif @@ -478,7 +478,7 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { void MultiDevSSAGraphBuilder::SetCommunicationContext( OpHandleBase *op_handle, const platform::Place &p) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_ == nullptr) { op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -492,7 +492,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_); @@ -522,7 +522,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new FusedBroadcastOpHandle( result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_); @@ -568,7 +568,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, const std::string &og) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); @@ -597,7 +597,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertDataBalanceOp( ir::Graph *result, const std::vector &datas) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && 
!defined(_WIN32) result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); @@ -694,7 +694,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); @@ -809,8 +809,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U); op_dev_id = GetAppropriateDeviceID({send_param_grad[1]}); - VLOG(100) << "send grad " << input_var_names[0] << " origin " - << send_param_grad[1] << " place: " << op_dev_id; + VLOG(10) << "send grad " << input_var_names[0] << " origin " + << send_param_grad[1] << " place: " << op_dev_id; for (auto &varname : input_var_names) { sharded_var_device->emplace(varname, op_dev_id); } @@ -826,9 +826,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( if (recv_param_grad.size() == 2U) { op_dev_id = GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device); - VLOG(100) << "recv param " << recv_param_grad[0] - << " get grad place: " << recv_param_grad[1] - << " place: " << op_dev_id; + VLOG(10) << "recv param " << recv_param_grad[0] + << " get grad place: " << recv_param_grad[1] + << " place: " << op_dev_id; } else { op_dev_id = GetAppropriateDeviceID(output_var_names); } @@ -862,7 +862,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( if (node->Op()->Type() == "fetch_barrier") { outvar_dev_id = GetVarDeviceID(*result, output->Name(), *sharded_var_device); - PADDLE_ENFORCE_NE(outvar_dev_id, -1); + PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); } p = places_[outvar_dev_id]; ir::Node *new_node = nullptr; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index f3ec2d2941..8e462aec7d 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -40,7 +40,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { size_t device_id) const; void Init() const; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) mutable platform::NCCLContextMap *nccl_ctxs_; #endif diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 4503123eac..c9f1107aea 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -125,7 +125,7 @@ void ReduceOpHandle::RunImpl() { } }); } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto pre_in = pre_in_var->Get(); VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); VariableVisitor::GetMutableTensor(out_var).mutable_data( diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 999828ae45..846839029c 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/scope.h" #include 
"paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -35,7 +35,7 @@ struct ReduceOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const platform::NCCLContextMap *nccl_ctxs_; ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 72299c0bfa..6cee4770e6 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -35,7 +35,7 @@ struct TestReduceOpHandle { std::vector gpu_list_; std::vector> ctxs_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; #endif @@ -43,7 +43,7 @@ struct TestReduceOpHandle { for (size_t j = 0; j < ctxs_.size(); ++j) { ctxs_[j]->Wait(); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_) { nccl_ctxs_->WaitAll(); } @@ -53,7 +53,7 @@ struct TestReduceOpHandle { void InitCtxOnGpu(bool use_gpu) { use_gpu_ = use_gpu; if (use_gpu) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) int count = p::GetCUDADeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -77,7 +77,7 @@ struct TestReduceOpHandle { gpu_list_.push_back(p); ctxs_.emplace_back(new p::CPUDeviceContext(p)); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_.reset(nullptr); #endif } @@ -99,14 +99,14 @@ struct TestReduceOpHandle { nodes.emplace_back(new ir::Node("node")); if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_, nccl_ctxs_.get())); #else PADDLE_THROW("CUDA is not support."); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_, nccl_ctxs_.get())); #else diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 28443cc886..08783fb5f8 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -140,8 +140,8 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( if (next_compute_op != nullptr) { if (compute_ref_cnt_map.count(next_compute_op)) { compute_ref_cnt_map[next_compute_op]->AddVar(var_name); - VLOG(50) << "Add reference count of " << var_name << " to Operator " - << next_compute_op->Name(); + VLOG(5) << "Add reference count of " << var_name << " to Operator " + << next_compute_op->Name(); } else { // Create new reference_count_op_handle ir::Node *ref_cnt_node = graph->CreateEmptyNode( diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 6ab6cb2332..ef16265997 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() { ->stream(); memory::Copy(boost::get(place_), tmp, 
platform::CPUPlace(), &coeff_, sizeof(float), stream); - VLOG(100) << place_ << "RUN Scale loss grad op"; + VLOG(10) << place_ << "RUN Scale loss grad op"; }); #endif } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index e5b1eaa731..499246a985 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -16,7 +16,7 @@ #include #include #include -#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/framework/details/reference_count_op_handle.h" diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc index f78a47bb78..cc2c8bfef9 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.cc +++ b/paddle/fluid/framework/details/sequential_execution_pass.cc @@ -94,8 +94,8 @@ std::unique_ptr SequentialExecutionPass::ApplyImpl( op_node_list[i - 1]->outputs.push_back(dep_var); dep_var->outputs.push_back(op_node_list[i]); dep_var->inputs.push_back(op_node_list[i - 1]); - VLOG(100) << "Add dependencies between " << op_node_list[i - 1]->Name() - << " and " << op_node_list[i]->Name(); + VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name() + << " and " << op_node_list[i]->Name(); } return graph; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index f781f02a07..677a293794 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -210,16 +210,16 @@ void ThreadedSSAGraphExecutor::RunOp( details::OpHandleBase *op) { auto op_run = [ready_var_q, op, this] { try { - if (VLOG_IS_ON(100)) { - VLOG(100) << op << " " << op->Name() << " : " << op->DebugString(); + if (VLOG_IS_ON(10)) { + VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); } if (LIKELY(!strategy_.dry_run_)) { op->Run(strategy_.use_cuda_); } - VLOG(100) << op << " " << op->Name() << " Done "; + VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->Outputs()); - VLOG(100) << op << " " << op->Name() << "Signal posted"; + VLOG(10) << op << " " << op->Name() << "Signal posted"; } catch (...) { exception_holder_.Catch(std::current_exception()); } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 3dc571d757..73cec21e20 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -21,6 +21,7 @@ limitations under the License. 
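Many hunks in this patch lower VLOG levels from 30/50/100 to 3/5/10. With glog, a VLOG(n) message is emitted only when the active verbosity is at least n (set via --v=n or GLOG_v=n), so the old three-digit levels effectively silenced these logs. A short sketch of the convention, assuming a program linked against glog:

    #include <glog/logging.h>

    int main(int argc, char* argv[]) {
      google::InitGoogleLogging(argv[0]);
      FLAGS_logtostderr = true;
      FLAGS_v = 5;  // same effect as passing --v=5 or GLOG_v=5

      VLOG(3) << "printed: level 3 <= verbosity 5";
      VLOG(5) << "printed: level 5 <= verbosity 5";
      VLOG(10) << "suppressed: level 10 > verbosity 5";
      return 0;
    }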
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/transfer_scope_cache.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -46,7 +47,7 @@ ExecutorPrepareContext::ExecutorPrepareContext( } ExecutorPrepareContext::~ExecutorPrepareContext() { - VLOG(50) << "destroy ExecutorPrepareContext"; + VLOG(5) << "destroy ExecutorPrepareContext"; } template @@ -63,7 +64,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, if ((it->second)-- == 1) { auto* var = scope.FindVar(name); if (var != nullptr) { - VLOG(100) << "Erase tensor \'" << name << "\'"; + VLOG(10) << "Erase tensor \'" << name << "\'"; if (var->IsType()) { erase_tensors.insert(var->GetMutable()); } else if (var->IsType()) { @@ -114,36 +115,6 @@ void Executor::Close() { #endif } -void InitializeVariable(Variable* var, proto::VarType::Type var_type) { - if (var_type == proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else if (var_type == proto::VarType::SELECTED_ROWS) { - var->GetMutable(); - } else if (var_type == proto::VarType::FEED_MINIBATCH) { - var->GetMutable(); - } else if (var_type == proto::VarType::FETCH_LIST) { - var->GetMutable(); - } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); - } else if (var_type == proto::VarType::LOD_RANK_TABLE) { - var->GetMutable(); - } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { - var->GetMutable(); - } else if (var_type == proto::VarType::PLACE_LIST) { - var->GetMutable(); - } else if (var_type == proto::VarType::READER) { - var->GetMutable(); - } else if (var_type == proto::VarType::RAW) { - // GetMutable will be called in operator - } else { - PADDLE_THROW( - "Variable type %d is not in " - "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", - var_type); - } -} - void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id) { auto& global_block = pdesc.Block(block_id); @@ -162,21 +133,21 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, if (var->Persistable()) { auto* ptr = const_cast(ancestor_scope)->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(30) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; } else { auto* ptr = scope->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(30) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; } } } else { for (auto& var : global_block.AllVars()) { auto* ptr = scope->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(30) << "Create variable " << var->Name() << ", which pointer is " - << ptr; + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; } } } @@ -307,7 +278,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, int i = 0; for (auto& feed_target : (*feed_targets)) { std::string var_name = feed_target.first; - VLOG(30) << "feed target's name: " << var_name; + VLOG(3) << "feed target's name: " << var_name; // prepend feed op auto* op = global_block->PrependOp(); @@ -330,7 +301,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, int i = 
0; for (auto& fetch_target : (*fetch_targets)) { std::string var_name = fetch_target.first; - VLOG(30) << "fetch target's name: " << var_name; + VLOG(3) << "fetch target's name: " << var_name; // append fetch op auto* op = global_block->AppendOp(); @@ -482,7 +453,7 @@ void Executor::RunPreparedContext( void Executor::EnableMKLDNN(const ProgramDesc& program) { #ifdef PADDLE_WITH_MKLDNN - VLOG(30) << "use_mkldnn=True"; + VLOG(3) << "use_mkldnn=True"; for (size_t bid = 0; bid < program.Size(); ++bid) { auto* block = const_cast(program).MutableBlock(bid); for (auto* op : block->AllOps()) { diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 36b36d49c2..2d47903ffb 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -26,7 +26,6 @@ limitations under the License. */ namespace paddle { namespace framework { -extern void InitializeVariable(Variable* var, proto::VarType::Type var_type); template std::unordered_map GetNonPersistableReferenceCount( diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc new file mode 100644 index 0000000000..4e4001e979 --- /dev/null +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -0,0 +1,223 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
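Both Executor::CreateVariables above and ExecutorThreadWorker::CreateThreadScope below follow the same rule: persistable variables (parameters) go into the long-lived root or ancestor scope, everything else into a child scope that can be dropped after each run, so DropKids() frees intermediates without touching parameters. A hedged sketch of that split, assuming InitializeVariable now lives in variable_helper.h as the new includes in this patch suggest; the function name CreateVars is illustrative.

    #include "paddle/fluid/framework/program_desc.h"
    #include "paddle/fluid/framework/scope.h"
    #include "paddle/fluid/framework/variable_helper.h"

    // Sketch: parameters live in the root scope, intermediates in a child
    // scope that can be dropped after every batch.
    void CreateVars(const paddle::framework::ProgramDesc& program,
                    paddle::framework::Scope* root_scope) {
      auto& child = root_scope->NewScope();
      for (auto& var : program.Block(0).AllVars()) {
        if (var->Persistable()) {
          auto* ptr = root_scope->Var(var->Name());
          paddle::framework::InitializeVariable(ptr, var->GetType());
        } else {
          auto* ptr = child.Var(var->Name());
          paddle::framework::InitializeVariable(ptr, var->GetType());
        }
      }
      // ... run ops against `child`, then release intermediates:
      root_scope->DropKids();
    }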
*/ + +#include "paddle/fluid/framework/executor_thread_worker.h" +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/pybind/pybind.h" +namespace paddle { +namespace framework { + +void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) { + auto& block = program.Block(0); + op_names_.clear(); + for (auto& op_desc : block.AllOps()) { + std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); + op_names_.push_back(op_desc->Type()); + OperatorBase* local_op_ptr = local_op.release(); + ops_.push_back(local_op_ptr); + continue; + } +} + +void ExecutorThreadWorker::CreateThreadResource( + const framework::ProgramDesc& program, + const paddle::platform::Place& place) { + CreateThreadScope(program); + CreateThreadOperators(program); + SetMainProgram(program); + SetPlace(place); +} + +void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) { + auto& block = program.Block(0); + + PADDLE_ENFORCE_NOT_NULL( + root_scope_, "root_scope should be set before creating thread scope"); + + thread_scope_ = &root_scope_->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto* ptr = root_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } else { + auto* ptr = thread_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } + } +} + +void ExecutorThreadWorker::SetDataFeed( + const std::shared_ptr& datafeed) { + thread_reader_ = datafeed; +} + +void ExecutorThreadWorker::BindingDataFeedMemory() { + const std::vector& input_feed = + thread_reader_->GetUseSlotAlias(); + for (auto name : input_feed) { + thread_reader_->AddFeedVar(thread_scope_->Var(name), name); + } +} + +void ExecutorThreadWorker::SetFetchVarNames( + const std::vector& fetch_var_names) { + fetch_var_names_.clear(); + fetch_var_names_.insert(fetch_var_names_.end(), fetch_var_names.begin(), + fetch_var_names.end()); +} + +void ExecutorThreadWorker::SetDevice() { +#if defined _WIN32 || defined __APPLE__ + return; +#else + static unsigned concurrency_cap = std::thread::hardware_concurrency(); + int thread_id = this->thread_id_; + + if (thread_id < concurrency_cap) { + unsigned proc = thread_id; + + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(proc, &mask); + + if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) { + VLOG(1) << "WARNING: Failed to set thread affinity for thread " + << thread_id; + } else { + CPU_ZERO(&mask); + if ((0 != sched_getaffinity(0, sizeof(mask), &mask)) || + (CPU_ISSET(proc, &mask) == 0)) { + VLOG(3) << "WARNING: Failed to set thread affinity for thread " + << thread_id; + } + } + } else { + VLOG(1) << "WARNING: Failed to set thread affinity for thread " + << thread_id; + } +#endif +} + +template +void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) { + auto inspect = lod_tensor.data(); + auto element_num = lod_tensor.numel(); + + std::ostringstream sstream; + sstream << var_name << " (element num " << element_num << "): ["; + sstream << 
inspect[0]; + for (int j = 1; j < element_num; ++j) { + sstream << " " << inspect[j]; + } + sstream << "]"; + + std::cout << sstream.str() << std::endl; +} + +void print_fetch_var(Scope* scope, std::string var_name) { + const LoDTensor& tensor = scope->FindVar(var_name)->Get(); + + if (std::type_index(tensor.type()) == + std::type_index(typeid(platform::float16))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(double))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(int64_t))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(uint8_t))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(int16_t))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(int8_t))) { + print_lod_tensor(var_name, tensor); + } else { + VLOG(1) << "print_fetch_var: unrecognized data type:" + << tensor.type().name(); + } + + return; +} + +void ExecutorThreadWorker::TrainFiles() { + // todo: configurable + SetDevice(); + + int fetch_var_num = fetch_var_names_.size(); + fetch_values_.clear(); + fetch_values_.resize(fetch_var_num); + + thread_reader_->Start(); + + int cur_batch; + int batch_cnt = 0; + while ((cur_batch = thread_reader_->Next()) > 0) { + // executor run here + for (auto& op : ops_) { + op->Run(*thread_scope_, place_); + } + + ++batch_cnt; + thread_scope_->DropKids(); + + if (debug_ == false || thread_id_ != 0) { + continue; + } + + for (int i = 0; i < fetch_var_num; ++i) { + print_fetch_var(thread_scope_, fetch_var_names_[i]); + } // end for (int i = 0...) + } // end while () +} + +void ExecutorThreadWorker::SetThreadId(int tid) { thread_id_ = tid; } + +void ExecutorThreadWorker::SetPlace(const platform::Place& place) { + place_ = place; +} + +void ExecutorThreadWorker::SetMainProgram( + const ProgramDesc& main_program_desc) { + main_program_.reset(new ProgramDesc(main_program_desc)); +} + +void ExecutorThreadWorker::SetRootScope(Scope* g_scope) { + root_scope_ = g_scope; +} + +} // einit_modelnd namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h new file mode 100644 index 0000000000..13ec2442c4 --- /dev/null +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
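ExecutorThreadWorker::SetDevice above pins each worker thread to one core with sched_setaffinity and simply returns on Windows and macOS. A condensed standalone sketch of the same pinning pattern (Linux only); the helper name PinToCore is illustrative.

    #if defined(__linux__)
    #include <sched.h>
    #include <iostream>
    #include <thread>

    // Pin the calling thread to `core`, mirroring SetDevice's use of
    // sched_setaffinity; returns false if the core index is out of range.
    bool PinToCore(unsigned core) {
      if (core >= std::thread::hardware_concurrency()) return false;
      cpu_set_t mask;
      CPU_ZERO(&mask);
      CPU_SET(core, &mask);
      // First argument 0 means "the calling thread"; 0 return means success.
      return sched_setaffinity(0, sizeof(mask), &mask) == 0;
    }

    int main() {
      std::cout << "pinned to core 0: " << PinToCore(0) << "\n";
      return 0;
    }
    #else
    int main() { return 0; }  // pinning is skipped off Linux, as in the patch
    #endif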
*/ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +void CreateTensor(Variable* var, proto::VarType::Type var_type); + +class ExecutorThreadWorker { + public: + ExecutorThreadWorker() + : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} + ~ExecutorThreadWorker() {} + + void CreateThreadResource(const framework::ProgramDesc& program, + const paddle::platform::Place& place); + void SetThreadId(int tid); + void SetDebug(const bool debug) { debug_ = debug; } + void SetRootScope(Scope* g_scope); + // set cpu device in this function + // cpu binding is used by default + void SetDevice(); + // since we read data into memory that can not be accessed by program + // we need to bind memory of data with corresponding variables in program + // this function should be called after data feed is set + void BindingDataFeedMemory(); + // set data feed declared in executor + void SetDataFeed(const std::shared_ptr& datafeed); + // A multi-thread training function + void TrainFiles(); + // set fetch variable names from python interface assigned by users + void SetFetchVarNames(const std::vector& fetch_var_names); + + private: + void CreateThreadScope(const framework::ProgramDesc& program); + void CreateThreadOperators(const framework::ProgramDesc& program); + void SetMainProgram(const ProgramDesc& main_program_desc); + void SetPlace(const paddle::platform::Place& place); + + protected: + // thread index + std::shared_ptr thread_reader_; // shared queue, thread buffer + int thread_id_; + // operator name + std::vector op_names_; + // thread level, local operators for forward and backward + std::vector ops_; + // main program for training + std::unique_ptr main_program_; + // execution place + platform::Place place_; + // root scope for model parameters + Scope* root_scope_; + // a thread scope, father scope is global score which is shared + Scope* thread_scope_; + + private: + std::vector fetch_var_names_; + std::vector> fetch_values_; + bool debug_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 1f3c19c0d5..3e9353f5cf 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -25,7 +25,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, const std::string& var_name, size_t index) { // If var_name Variable is not found in GlobalScope, a new variable will // be created. 
- VLOG(30) << "SetFeedVariable name=" << var_name << " index=" << index; + VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); auto& feed_inputs = *(g_feed_value->GetMutable()); if (index >= feed_inputs.size()) { @@ -47,8 +47,8 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, typeid(FeedFetchList).name()); auto& fetch_outputs = *g_fetch_value->GetMutable(); auto& tensor = fetch_outputs[index]; - VLOG(30) << "Fetch " << var_name << " with index " << index - << " shape= " << tensor.dims(); + VLOG(3) << "Fetch " << var_name << " with index " << index + << " shape= " << tensor.dims(); PADDLE_ENFORCE_LT(index, fetch_outputs.size()); return tensor; } diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index c436dd414d..a9897e0bb8 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -147,19 +147,19 @@ void PrepareParameters(Graph* graph, const Param& param) { scope->Var(param.LSTMX)->GetMutable(); scope->Var(param.LSTMOUT)->GetMutable(); -#define GATE_W(name__) \ - auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \ - auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \ - auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \ - CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \ - VLOG(40) << #name__ "_w0" \ - << " shape: " << W_##name__##_w0->Get().dims(); \ - VLOG(40) << #name__ "_w1" \ - << " shape: " << W_##name__##_w1->Get().dims(); \ - VLOG(40) << #name__ "_b0" \ - << " shape: " << W_##name__##_b0->Get().dims(); \ - auto& W_##name__##_w0_t = W_##name__##_w0->Get(); \ - auto& W_##name__##_w1_t = W_##name__##_w1->Get(); \ +#define GATE_W(name__) \ + auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \ + auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \ + auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \ + CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \ + VLOG(4) << #name__ "_w0" \ + << " shape: " << W_##name__##_w0->Get().dims(); \ + VLOG(4) << #name__ "_w1" \ + << " shape: " << W_##name__##_w1->Get().dims(); \ + VLOG(4) << #name__ "_b0" \ + << " shape: " << W_##name__##_b0->Get().dims(); \ + auto& W_##name__##_w0_t = W_##name__##_w0->Get(); \ + auto& W_##name__##_w1_t = W_##name__##_w1->Get(); \ auto& W_##name__##_b0_t = W_##name__##_b0->Get(); GATE_W(forget); @@ -208,7 +208,7 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0, int D = W_forget_w0.dims()[0]; int M = W_forget_w1.dims()[0]; out->Resize(make_ddim({D + M, 4 * D})); - VLOG(30) << "LSTMWeight resized to " << out->dims(); + VLOG(3) << "LSTMWeight resized to " << out->dims(); float* out_data = out->mutable_data(platform::CPUPlace()); std::array tensors{ diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc index c9c4d5afe5..449cc78be1 100644 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc @@ -57,7 +57,7 @@ std::unique_ptr ConvBiasFusePass::ApplyImpl( int found_conv_bias_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(40) << "handle ConvBias fuse"; + VLOG(4) << "handle ConvBias fuse"; GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, conv_bias_pattern); // Filter GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern); // 
tmp @@ -74,7 +74,7 @@ std::unique_ptr ConvBiasFusePass::ApplyImpl( // check if fuse can be done and if MKL-DNN should be used FuseOptions fuse_option = FindFuseOption(*conv, *eltwise); if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) { - VLOG(30) << "do not perform conv+bias fuse"; + VLOG(3) << "do not perform conv+bias fuse"; return; } diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 34b4c26ae3..846a14e365 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -121,7 +121,7 @@ std::unique_ptr ConvBNFusePass::ApplyImpl( int found_conv_bn_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(40) << "handle ConvBN fuse"; + VLOG(4) << "handle ConvBN fuse"; // conv, batch_norm, // conv_weight, conv_out, @@ -133,7 +133,7 @@ std::unique_ptr ConvBNFusePass::ApplyImpl( // check if fuse can be done and if MKL-DNN should be used FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm); if (fuse_option == DO_NOT_FUSE) { - VLOG(30) << "do not perform conv+bn fuse"; + VLOG(3) << "do not perform conv+bn fuse"; return; } @@ -241,7 +241,7 @@ std::unique_ptr ConvEltwiseAddBNFusePass::ApplyImpl( int found_conv_bn_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(40) << "handle ConvBN fuse"; + VLOG(4) << "handle ConvBN fuse"; // conv, batch_norm, // conv_weight, conv_out, diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc index 048868e1f9..e359a3832e 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc @@ -38,7 +38,7 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( int found_conv_relu_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(40) << "handle ConvReLU fuse"; + VLOG(4) << "handle ConvReLU fuse"; GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, conv_relu_pattern); // Filter GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp @@ -48,7 +48,7 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( FuseOptions fuse_option = FindFuseOption(*conv, *relu); if (fuse_option == DO_NOT_FUSE) { - VLOG(30) << "do not perform conv+relu fuse"; + VLOG(3) << "do not perform conv+relu fuse"; return; } diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc index 5f3334578d..19056e18aa 100644 --- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc @@ -39,7 +39,7 @@ std::unique_ptr DepthwiseConvMKLDNNPass::ApplyImpl( int found_depthwise_conv_mkldnn_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(30) << "handle DepthwiseConvMKLDNN fuse"; + VLOG(3) << "handle DepthwiseConvMKLDNN fuse"; GET_NODE(depthwise_conv, (*pattern)); depthwise_conv->Op()->SetType("conv2d"); found_depthwise_conv_mkldnn_count++; diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 7b6ce0da07..26eac93905 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -39,7 +39,7 @@ std::unique_ptr FCFusePass::ApplyImpl( int found_fc_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - 
VLOG(40) << "handle FC fuse"; + VLOG(4) << "handle FC fuse"; GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc index 8ed68905be..648acc4a75 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc @@ -61,7 +61,7 @@ std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddAct( auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { - VLOG(40) << "handle FuseElewiseAddAct fuse"; + VLOG(4) << "handle FuseElewiseAddAct fuse"; GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_pattern); GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, elewise_add_act_pattern); @@ -77,10 +77,10 @@ std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddAct( Node *elewise_add_act_node = CreateFuseElewiseAddActNode( g, act, ele_add, ele_x_n, ele_y_n, ele_out_n, act_out_n); - VLOG(40) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> " - << ele_add->Name() << " -> " << ele_out_n << "\n" - << "\t " << ele_out_n << " -> " << act->Name() << " -> " - << act_out_n; + VLOG(4) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> " + << ele_add->Name() << " -> " << ele_out_n << "\n" + << "\t " << ele_out_n << " -> " << act->Name() << " -> " + << act_out_n; ReLinkNodes(g, ele_out, ele_add, act, elewise_add_act_node); found_elewise_add_act_count++; @@ -113,7 +113,7 @@ std::unique_ptr FuseElewiseAddActPass::FuseActElewiseAdd( auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { - VLOG(40) << "handle FuseElewiseAddAct fuse"; + VLOG(4) << "handle FuseElewiseAddAct fuse"; GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, act_elewise_add_pattern); GET_IR_NODE_FROM_SUBGRAPH(ele_x, ele_x, act_elewise_add_pattern); GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, @@ -129,9 +129,9 @@ std::unique_ptr FuseElewiseAddActPass::FuseActElewiseAdd( Node *elewise_add_act_node = CreateFuseElewiseAddActNode( g, ele_add, act, elewise_add_x_n, act_i_n, act_o_n, elewise_add_out_n); - VLOG(40) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n - << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> " - << ele_add->Name() << " -> " << elewise_add_out_n; + VLOG(4) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n + << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> " + << ele_add->Name() << " -> " << elewise_add_out_n; ReLinkNodes(g, act_out, act, ele_add, elewise_add_act_node); found_elewise_add_act_count++; @@ -165,7 +165,7 @@ std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { - VLOG(40) << "handle FuseElewiseAddActGrad1 fuse"; + VLOG(4) << "handle FuseElewiseAddActGrad1 fuse"; GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_grad_pattern); GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, elewise_add_act_grad_pattern); GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out, @@ -208,10 +208,10 @@ std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( auto fused_node = g->CreateOpNode(&desc); - VLOG(40) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> " - << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t " - << d_itermediate_out_n << " and " << act_out_n << " -> " - << ele_add_grad->Name() << " -> " << 
d_itermediate_out_n; + VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> " + << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t " + << d_itermediate_out_n << " and " << act_out_n << " -> " + << ele_add_grad->Name() << " -> " << d_itermediate_out_n; ReLinkNodes(g, d_itermediate_out, act_grad, ele_add_grad, fused_node); found_elewise_add_act_count++; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index ae0e42ff5e..fc91564bba 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -90,7 +90,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { std::map> Graph::InitFromProgram( const ProgramDesc &program) { - VLOG(30) << "block in program:" << program_.Size(); + VLOG(3) << "block in program:" << program_.Size(); std::unordered_map all_vars; // var nodes for each var name, will have multiple versions in SSA std::map> var_nodes; @@ -158,7 +158,7 @@ void Graph::ResolveHazard( auto it_old = versions.rbegin(); ++it_old; for (; it_old != versions.rend(); it_new = it_old, ++it_old) { - VLOG(30) << "deal with var: " << (*it_new)->Name(); + VLOG(3) << "deal with var: " << (*it_new)->Name(); ir::Node *write_op = (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0]; const auto &read_ops = (*it_old)->outputs; diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 0c856f8e61..947c934f0f 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -89,7 +89,7 @@ class Graph { attr_name); attrs_[attr_name] = attr; attr_dels_[attr_name] = [attr, attr_name]() { - VLOG(30) << "deleting " << attr_name; + VLOG(3) << "deleting " << attr_name; delete attr; }; } diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 963179192f..d2d28793c4 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -40,9 +40,8 @@ void SortHelper( } } - VLOG(30) << "topology sort insert: " << node->Name() - << reinterpret_cast(node) << " input " - << node->inputs.size(); + VLOG(3) << "topology sort insert: " << node->Name() + << reinterpret_cast(node) << " input " << node->inputs.size(); ret->push_back(node); } @@ -111,9 +110,9 @@ std::map> BuildOperationAdjList( for (auto &var : n->inputs) { for (auto &adj_n : var->inputs) { PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); - VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) - << " -> " << n->Name() << reinterpret_cast(n) - << " via " << var->Name() << reinterpret_cast(var); + VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) + << " -> " << n->Name() << reinterpret_cast(n) + << " via " << var->Name() << reinterpret_cast(var); adj_list[n].insert(adj_n); } } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index f1f971656a..258182b25a 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -92,19 +92,19 @@ void GraphPatternDetector::operator()(Graph *graph, PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size()); int id = 0; for (auto &g : subgraphs) { - VLOG(30) << "optimizing #" << id++ << " subgraph"; + VLOG(3) << "optimizing #" << id++ << " subgraph"; handler(g, graph); } } bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { - VLOG(30) << "mark pdnodes in graph"; + VLOG(3) << "mark pdnodes 
in graph"; if (graph.Nodes().empty()) return false; for (auto &node : GraphTraits::DFS(graph)) { for (const auto &pdnode : pattern_.nodes()) { if (pdnode->Tell(&node)) { - VLOG(40) << "pdnode " << pdnode->name() << " marked"; + VLOG(4) << "pdnode " << pdnode->name() << " marked"; pdnodes2nodes_[pdnode.get()].insert(&node); } } @@ -112,7 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { // Check to early stop if some PDNode can't find matched Node. for (auto &pdnode : pattern_.nodes()) { if (!pdnodes2nodes_.count(pdnode.get())) { - VLOG(40) << pdnode->name() << " can't find matched Node, early stop"; + VLOG(4) << pdnode->name() << " can't find matched Node, early stop"; // return false; } } @@ -121,7 +121,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { GetMarkedNodes(const_cast(&graph)).insert(n); } } - VLOG(30) << pdnodes2nodes_.size() << " nodes marked"; + VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; return !pdnodes2nodes_.empty(); } @@ -215,7 +215,7 @@ GraphPatternDetector::DetectPatterns() { // Extend a PDNode to subgraphs by deducing the connection relations defined // in edges of PDNodes. for (const auto &edge : pattern_.edges()) { - VLOG(40) << "check " << edge.first->name() << " -> " << edge.second->name(); + VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); // TODO(Superjomn) Fix bug here, the groups might be duplicate here. // Each role has two PDNodes, which indicates two roles. // Detect two Nodes that can match these two roles and they are connected. @@ -226,7 +226,7 @@ GraphPatternDetector::DetectPatterns() { // source -> target for (Node *source : pdnodes2nodes_[edge.first]) { for (Node *target : pdnodes2nodes_[edge.second]) { - VLOG(80) << "check " << source->id() << " -- " << target->id(); + VLOG(8) << "check " << source->id() << " -- " << target->id(); // TODO(Superjomn) add some prune strategies. 
for (const auto &group : pre_groups) { if (IsNodesLink(source, target)) { @@ -243,13 +243,12 @@ GraphPatternDetector::DetectPatterns() { } } } - VLOG(30) << "step " << step << " get records: " << cur_groups.size(); + VLOG(3) << "step " << step << " get records: " << cur_groups.size(); for (auto &group : cur_groups) { for (auto &item : group.roles) { - VLOG(40) << "node " << item.second->id() << " as " - << item.first->name(); + VLOG(4) << "node " << item.second->id() << " as " << item.first->name(); } - VLOG(40) << "========================================================="; + VLOG(4) << "========================================================="; } } diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 13dd354dc5..31ed98db72 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -41,7 +41,7 @@ std::string FormatName(const Node* node) { std::unique_ptr GraphVizPass::ApplyImpl( std::unique_ptr graph) const { const std::string graph_viz_path = Get(kGraphVizPath); - VLOG(30) << "draw IR graph viz to " << graph_viz_path; + VLOG(3) << "draw IR graph viz to " << graph_viz_path; std::unique_ptr fout(new std::ofstream(graph_viz_path)); PADDLE_ENFORCE(fout->good()); std::ostream& sout = *fout; diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 292f232ffc..6d8f020918 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -38,7 +38,7 @@ std::unique_ptr IsTestPass::ApplyImpl( for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); - if (op->HasAttr("is_test")) { + if (n->RuntimeHasAttr("is_test")) { op->SetAttr("is_test", true); } else if (std::find(begin(op_list), end(op_list), op->Type()) != end(op_list)) { diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index cd2cb0c9f8..d9a68c7f1d 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -15,7 +15,10 @@ #include "paddle/fluid/framework/ir/is_test_pass.h" #include - +#ifdef _WIN32 +#undef FALSE +#undef TRUE +#endif namespace paddle { namespace framework { namespace ir { @@ -101,9 +104,9 @@ TEST(IsTestPass, basic) { auto* op = node->Op(); auto op_name = boost::get(op->GetAttr("name")); if (op_name == "conv3") { - ASSERT_FALSE(op->HasAttr("is_test")); + ASSERT_FALSE(node->RuntimeHasAttr("is_test")); } else { - ASSERT_TRUE(op->HasAttr("is_test")); + ASSERT_TRUE(node->RuntimeHasAttr("is_test")); EXPECT_TRUE(boost::get(op->GetAttr("is_test"))); } } diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc index 145a3a455c..1cf1315d3d 100644 --- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc @@ -20,9 +20,9 @@ namespace ir { std::unique_ptr MKLDNNPlacementPass::ApplyImpl( std::unique_ptr graph) const { - VLOG(30) << "Aplies MKL-DNN placement strategy."; + VLOG(3) << "Aplies MKL-DNN placement strategy."; for (const Node* n : graph->Nodes()) { - if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) { + if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) { n->Op()->SetAttr("use_mkldnn", true); } } diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index 532961e4d5..bd5b76426e 100644 --- 
a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -62,7 +62,7 @@ VarDesc UpdateGradVarDesc( string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat); VarDesc repeated_var = CopyVarDesc(var_desc); repeated_var.SetName(new_gname); - VLOG(30) << "update " << var_desc->Name() << " to repeat " << repeat; + VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat; return repeated_var; } return *var_desc; @@ -78,7 +78,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( std::vector nodes = TopologySortOperations(*graph); auto origin_nodes = graph->ReleaseNodes(); - VLOG(30) << "origin nodes count: " << origin_nodes.size(); + VLOG(3) << "origin nodes count: " << origin_nodes.size(); ir::Graph& result = *graph; // 1. record op nodes of different roles @@ -137,8 +137,8 @@ std::unique_ptr BatchMergePass::ApplyImpl( "%s.repeat.%d", repeated_op.Input("Variance")[0], i); bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]); bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]); - VLOG(30) << "renaming " << repeated_op.Input("Mean")[0] << " to " - << new_mean_name; + VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to " + << new_mean_name; repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name); repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name); repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0], diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 50d9113088..7a88cb2b68 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_info.h" namespace paddle { namespace framework { @@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[]; const char Node::kControlDepVarName[] = "__control_var"; #endif -std::unique_ptr CreateNodeForTest(const std::string& name, +std::unique_ptr CreateNodeForTest(const std::string &name, Node::Type type) { return std::unique_ptr(new Node(name, type)); } + +bool Node::RuntimeHasAttr(const std::string &name) const { + if (Op()->HasAttr(name)) { + return true; + } else { + auto &op_info = OpInfoMap::Instance(); + auto op_type = Op()->Type(); + if (op_info.Has(op_type)) { + auto op_info_ptr = op_info.Get(op_type); + if (op_info_ptr.HasOpProtoAndChecker()) { + const proto::OpProto &proto = op_info_ptr.Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return true; + } + } + } + } + } + return false; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d2a393b3f1..1044a96430 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -108,6 +108,18 @@ class Node { Name().find(ir::Node::kControlDepVarName) != std::string::npos; } + // RuntimeHasAttr is different with HasAttr now. + // 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr, + // thus, if stored program_desc_ are old which don't have an attr, a new + // library which adds the attr already will fail on this function. + // Details: + // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087 + // 2. 
For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above + // problem. + // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding + // RuntimeHasAttr. + bool RuntimeHasAttr(const std::string& name) const; + std::vector inputs; std::vector outputs; diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 615b539695..a3559247db 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -76,7 +76,7 @@ class Pass { attr_name); attrs_[attr_name] = attr; attr_dels_[attr_name] = [attr, attr_name]() { - VLOG(30) << "deleting " << attr_name; + VLOG(3) << "deleting " << attr_name; delete attr; }; } diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc index b7687d61de..012e68036c 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -196,7 +196,7 @@ std::unique_ptr SeqConcatFcFusePass::ApplyImpl( detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - VLOG(40) << "get one concat pattern"; + VLOG(4) << "get one concat pattern"; // fc GET_NODE(fc_w, detector.pattern()); GET_NODE(fc_bias, detector.pattern()); diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc index 015b5e3c63..0a1f65d274 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc @@ -60,7 +60,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(40) << "handle SeqConv EltAdd Relu fuse"; + VLOG(4) << "handle SeqConv EltAdd Relu fuse"; GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern); GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern); GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern); diff --git a/paddle/fluid/framework/lod_rank_table.cc b/paddle/fluid/framework/lod_rank_table.cc index 660ce2ec85..6bc795b642 100644 --- a/paddle/fluid/framework/lod_rank_table.cc +++ b/paddle/fluid/framework/lod_rank_table.cc @@ -31,7 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) { TableItem item; item.index = i; item.length = vec[i + 1] - vec[i]; - VLOG(100) << "Add item to rank table " << item.index << " " << item.length; + VLOG(10) << "Add item to rank table " << item.index << " " << item.length; items_.emplace_back(item); } // NOTE(yuyang18): diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 669d08c70c..9b2eeaf59a 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -26,10 +26,8 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" -#if !defined(_WIN32) #include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/writer.h" -#endif // _WIN32 namespace paddle { namespace framework { @@ -305,7 +303,6 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, TensorFromStream(is, static_cast(tensor), dev_ctx); } -#if !defined(_WIN32) void WriteToRecordIO(recordio::Writer *writer, const std::vector &tensor, const platform::DeviceContext &dev_ctx) { @@ -335,19 +332,7 @@ bool ReadFromRecordIO(recordio::Scanner *scanner, return true; } -#else -class Writer {}; -class Scanner {}; -void WriteToRecordIO(recordio::Writer *writer, - const std::vector &tensor, - const platform::DeviceContext &dev_ctx) {} -bool ReadFromRecordIO(recordio::Scanner *scanner, - const platform::DeviceContext &dev_ctx, - std::vector *result_ptr) { - PADDLE_ENFORCE("windows didn't supported recordio!."); - return true; -} -#endif // _WIN32 + std::vector LoDTensor::SplitLoDTensor( const std::vector places) const { check_memory_size(); diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index cbf5fd04d7..cd50aaa260 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -274,7 +274,6 @@ TEST(LoD, ConvertToOffsetBasedLoD) { EXPECT_EQ(offset_lod, expected); } -#if !defined(_WIN32) template static void TestRecordIO() { LoDTensor tensor; @@ -321,7 +320,6 @@ TEST(LoDTensor, RecordIO) { TestRecordIO(); TestRecordIO(); } -#endif // !defined(_WIN32) } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc index 0330cae377..0599c8d384 100644 --- a/paddle/fluid/framework/mixed_vector_test.cc +++ b/paddle/fluid/framework/mixed_vector_test.cc @@ -51,7 +51,7 @@ TEST(mixed_vector, InitWithCount) { TEST(mixed_vector, ForEach) { vec tmp; for (auto& v : tmp) { - VLOG(30) << v; + VLOG(3) << v; } } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index e829563952..f1642bc0d2 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -21,42 +21,11 @@ #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace framework { - -// These code can be shared with Executor. 
-static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { - if (var_type == proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else if (var_type == proto::VarType::SELECTED_ROWS) { - var->GetMutable(); - } else if (var_type == proto::VarType::FEED_MINIBATCH) { - var->GetMutable(); - } else if (var_type == proto::VarType::FETCH_LIST) { - var->GetMutable(); - } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); - } else if (var_type == proto::VarType::LOD_RANK_TABLE) { - var->GetMutable(); - } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { - var->GetMutable(); - } else if (var_type == proto::VarType::PLACE_LIST) { - var->GetMutable(); - } else if (var_type == proto::VarType::READER) { - var->GetMutable(); - } else if (var_type == proto::VarType::RAW) { - // GetMutable will be called in operator - } else { - PADDLE_THROW( - "Variable type %d is not in " - "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]", - var_type); - } -} - void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, int block_id, bool with_feed_fetch_ops) { if (!scope) { diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index 8177436d0b..e22c290377 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -15,23 +15,105 @@ limitations under the License. */ #ifdef PADDLE_WITH_NGRAPH #include #include +#include #include "paddle/fluid/framework/ngraph_bridge.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" #include "ngraph/ngraph.hpp" namespace paddle { namespace framework { +static std::shared_ptr GetNode( + const std::shared_ptr& op, const std::string prm, + const VariableNameMap& var_map, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto& var_names = var_map.at(prm); + PADDLE_ENFORCE_EQ(var_names.size(), 1, + "op %s prm %s expects one associated var", op->Type(), prm); + if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { + return (*ngb_node_map)[var_names[0]]; + } else { + return nullptr; + } +} + +static std::shared_ptr GetInputNode( + const std::shared_ptr& op, const std::string prm, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + return GetNode(op, prm, op->Inputs(), ngb_node_map); +} + +static std::shared_ptr GetOutputNode( + const std::shared_ptr& op, const std::string prm, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + return GetNode(op, prm, op->Outputs(), ngb_node_map); +} + +static void SetOutputNode( + const std::shared_ptr& op, const std::string prm, + std::shared_ptr node, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto& var_names = op->Outputs().at(prm); + if (var_names.size() == 1) { + (*ngb_node_map)[var_names[0]] = node; + } else if (var_names.size() == 0) { + (*ngb_node_map)[""] = node; + } else { + PADDLE_THROW("prm %s has more than 1 var_names.", prm); + } +} + +static bool HasOutput(const std::shared_ptr& op, + const std::string prm) { + auto& outputs = op->Outputs(); + if (outputs.find(prm) == outputs.end()) return false; + return outputs.at(prm).size() > 0; +} + +template +static void BuildBinaryNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = GetInputNode(op, "X", ngb_node_map); + auto y = GetInputNode(op, "Y", ngb_node_map); + auto out = std::make_shared(x, y); + SetOutputNode(op, "Out", out, 
ngb_node_map); +} + +template +static void BuildUnaryNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto input = GetInputNode(op, "X", ngb_node_map); + auto out = std::make_shared(input); + SetOutputNode(op, "Out", out, ngb_node_map); +} + std::map&, std::shared_ptr>>)>> - NgraphBridge::NG_NODE_MAP = {}; + NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode}, + {"tanh", BuildUnaryNode}}; -void NgraphBridge::build_graph(const std::shared_ptr& op) { +void NgraphBridge::BuildNgNode(const std::shared_ptr& op) { auto& op_type = op->Type(); - NG_NODE_MAP[op_type](op, ngb_node_map); + NG_NODE_MAP[op_type](op, ngb_node_map_); } } // namespace framework diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h index 55bf0d21f3..9ed6b95109 100644 --- a/paddle/fluid/framework/ngraph_bridge.h +++ b/paddle/fluid/framework/ngraph_bridge.h @@ -20,16 +20,14 @@ limitations under the License. */ #include #include #include -#include -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" - -#include "ngraph/ngraph.hpp" +#include "ngraph/node.hpp" namespace paddle { namespace framework { +class OperatorBase; + class NgraphBridge { public: static std::map< @@ -43,14 +41,14 @@ class NgraphBridge { std::shared_ptr< std::unordered_map>> var_node_map) - : ngb_node_map(var_node_map) {} + : ngb_node_map_(var_node_map) {} - void build_graph(const std::shared_ptr& op); + void BuildNgNode(const std::shared_ptr& op); private: std::shared_ptr< std::unordered_map>> - ngb_node_map; + ngb_node_map_; }; } // namespace framework diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index d967b2780c..3fea753f06 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -19,14 +19,29 @@ limitations under the License. */ #include #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/ngraph_operator.h" -#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_type.h" +#include "ngraph/ngraph.hpp" + namespace paddle { namespace framework { +static ngraph::Shape Ddim2Shape(const DDim& dims) { + ngraph::Shape sp; + for (int i = 0; i < dims.size(); ++i) { + int k = dims[i]; + k = k == 0 ? 
1 : k; + sp.push_back(k); + } + return sp; +} + static std::map pd2ng_type_map = { {proto::VarType::FP32, ngraph::element::f32}, {proto::VarType::FP64, ngraph::element::f64}, @@ -42,6 +57,7 @@ typedef enum { /* nGraph support state on ops */ PARTIAL_TEST /* Support partial list of ops for test */ } op_state; +// perform graph build through bridge and execute computation class NgraphOperator { public: explicit NgraphOperator(const Scope& scope, const platform::Place& place, @@ -59,13 +75,23 @@ class NgraphOperator { persistables_(persist), fetches_(fetches), post_op_inputs_(post_op_inputs), - ng_op_state_(ng_op_state) {} + ng_op_state_(ng_op_state) { + var_in_node_map_ = std::make_shared< + std::unordered_map>>(); + + var_node_map_ = std::make_shared< + std::unordered_map>>(); + + BuildNgIO(); + + GetNgFunction(); + } void Run(const Scope& scope, const platform::Place& place) const; private: static std::unordered_map> - func_cache; + func_cache_; const Scope& scope_; const platform::Place& place_; std::vector> fused_ops_; @@ -74,6 +100,35 @@ class NgraphOperator { std::unordered_set fetches_; std::unordered_set post_op_inputs_; op_state ng_op_state_; + + // ngraph backend eg. CPU + static std::shared_ptr backend_; + // ngraph function to call and execute + std::shared_ptr ngraph_function_; + // var_name of inputs + std::vector var_in_; + // var_name of outputs from fetch in order + std::vector var_out_; + // map input vars to nodes + std::shared_ptr< + std::unordered_map>> + var_in_node_map_; + // map each var name with a ngraph node + std::shared_ptr< + std::unordered_map>> + var_node_map_; + // cache key to check if function is cached + std::shared_ptr GetCacheKey(); + // get ngraph input and define ngraph input parameters + void GetNgInputShape(std::shared_ptr op); + // Call ngraph bridge to map ops + void BuildNgNodes(); + // get the ngraph input and output var list + void BuildNgIO(); + // build ngraph function call + void BuildNgFunction(); + // Check cache for ngraph function or otherwise build the function + void GetNgFunction(); }; std::vector>::iterator>> @@ -86,7 +141,7 @@ FusedOperator::FusedOpIntervals( } size_t size = ops->size(); size_t left = 0; - while (left < size && ops.at(left)->Type() != kFeedOpType) { + while (left < size && ops->at(left)->Type() != kFeedOpType) { ++left; } if (left == size) { @@ -116,7 +171,7 @@ FusedOperator::FusedOpIntervals( size_t start = pivot, end = start; while (pivot < right && (paddle::framework::NgraphBridge::NG_NODE_MAP.find( - ops.at(pivot)->Type()) != + ops->at(pivot)->Type()) != paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { ++pivot; ++end; @@ -136,7 +191,9 @@ FusedOperator::FusedOperator( std::vector>::iterator end, const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) { + : OperatorBase(type, inputs, outputs, attrs), + pdesc_(prog), + block_(block_id) { for (std::vector>::iterator it = start; it != end; ++it) { fused_ops_.push_back(std::move(*it)); @@ -152,7 +209,7 @@ FusedOperator::FusedOperator( } if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { - is_complete = true; + is_full_ = true; } Process(); @@ -205,7 +262,7 @@ void FusedOperator::RunImpl(const Scope& scope, } } - if (is_full) { + if (is_full_) { ng_op_state = ng_op_state == PARTIAL_TEST ? 
FULL_TEST : FULL_TRAIN; } @@ -215,6 +272,280 @@ void FusedOperator::RunImpl(const Scope& scope, ngraph_op.Run(scope, place); } +std::unordered_map> + NgraphOperator::func_cache_ = {}; + +std::shared_ptr NgraphOperator::backend_ = + ngraph::runtime::Backend::create("CPU"); + +void NgraphOperator::GetNgInputShape(std::shared_ptr op) { + op->RuntimeInferShape(scope_, place_); + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto sp = Ddim2Shape(tensor_pd->dims()); + if (std::find(var_in_.begin(), var_in_.end(), var_name) != + var_in_.end()) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, sp, true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } + } + } + } + } +} + +void NgraphOperator::BuildNgNodes() { + for (auto& var_name : var_out_) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + auto ng_shape = Ddim2Shape(ddim); + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, ng_shape, true); + (*var_node_map_)[var_name] = prm; + } + } + } + + paddle::framework::NgraphBridge ngb(var_node_map_); + for (auto& op : fused_ops_) { + ngb.BuildNgNode(op); + } +} + +void NgraphOperator::BuildNgIO() { + std::unordered_set inputs; + std::unordered_set outputs; + + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + inputs.insert(var_name); + const bool is_output = outputs.find(var_name) != outputs.end(); + if (!is_output && + std::find(var_in_.begin(), var_in_.end(), var_name) == + var_in_.end()) { + // fill var_in here to keep lhs and rhs order + var_in_.push_back(var_name); + } + } + } + + if (op->Type() != "fill_constant") { + GetNgInputShape(op); + } + + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + outputs.insert(var_name); + } + } + } + + // var_out.clear(); + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + switch (ng_op_state_) { + case PARTIAL_TEST: + if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || + fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case FULL_TEST: + if (fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case PARTIAL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + post_op_inputs_.find(var_name) != post_op_inputs_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + case FULL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + default: + var_out_.push_back(var_name); + } + } + } + } +} + +void NgraphOperator::BuildNgFunction() { + BuildNgNodes(); + 
ngraph_function_ = nullptr; + ngraph::NodeVector func_outputs; + ngraph::op::ParameterVector func_inputs; + + for (auto& vo : var_out_) { + func_outputs.push_back(var_node_map_->at(vo)); + } + + for (auto& vi : var_in_) { + std::shared_ptr prm = + std::dynamic_pointer_cast( + var_in_node_map_->at(vi)); + func_inputs.push_back(prm); + } + + ngraph_function_ = + std::make_shared(func_outputs, func_inputs); +} + +std::shared_ptr NgraphOperator::GetCacheKey() { + auto cache_key = std::make_shared(""); + *cache_key += std::to_string(fused_ops_.size()); + for (auto& op : fused_ops_) { + *cache_key += op->Type(); + } + for (auto& var_name : var_in_) { + auto shape = var_node_map_->at(var_name)->get_shape(); + *cache_key += var_name; + *cache_key += var_type_map_.at(var_name).c_type_string(); + for (size_t i = 0; i < shape.size(); ++i) { + *cache_key += std::to_string(shape.at(i)); + } + } + + for (auto& var_name : var_out_) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + for (int i = 0; i < ddim.size(); ++i) { + *cache_key += std::to_string(ddim[i]); + } + } + } + return cache_key; +} + +void NgraphOperator::GetNgFunction() { + bool cache_on = true; + if (cache_on) { + std::string cache_key_val = *GetCacheKey(); + if (func_cache_.find(cache_key_val) != func_cache_.end()) { + ngraph_function_ = func_cache_.at(cache_key_val); + } else { + BuildNgFunction(); + func_cache_[cache_key_val] = ngraph_function_; + } + } else { + BuildNgFunction(); + } +} + +void NgraphOperator::Run(const Scope& scope, + const platform::Place& place) const { + std::vector> t_in; + std::vector> t_out; + + for (size_t i = 0; i < var_in_.size(); ++i) { + auto vi = var_in_.at(i); + auto sp = var_node_map_->at(vi)->get_shape(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), + "Ensure ngraph tensor layout align with paddle tensor"); + if (tensor_pd->type().hash_code() == + typeid(float).hash_code()) { // NOLINT + const float* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::f32, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == + typeid(int).hash_code()) { // NOLINT + const int* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i32, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) { + const int64_t* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i64, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == + typeid(double).hash_code()) { // NOLINT + const double* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::f64, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == + typeid(bool).hash_code()) { // NOLINT + const bool* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::boolean, sp, + const_cast(arr)); + } else { + PADDLE_THROW("Data type not handling for var %s", vi); + } + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + } + bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST) + ? true + : false; + bool is_persistable = + (persistables_.find(vi) != persistables_.end()) ? 
true : false; + if (is_test && is_persistable) { + ti->set_stale(false); + } + t_in.push_back(ti); + } + + for (size_t i = 0; i < var_out_.size(); ++i) { + auto var_name = var_out_[i]; + auto* var = scope.FindVar(var_name); + std::shared_ptr to; + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + auto dd = tensor_pd->dims(); + ngraph::Shape sp = Ddim2Shape(dd); + auto ng_type = var_type_map_.at(var_name); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handled in for var %s", var_name); + } + t_out.push_back(to); + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", var_name); + } + } + + backend_->call(ngraph_function_, t_out, t_in); +} // NgraphOperator::RunImpl } // namespace framework } // namespace paddle #endif diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h index 0f655cef1d..3ca023e111 100644 --- a/paddle/fluid/framework/ngraph_operator.h +++ b/paddle/fluid/framework/ngraph_operator.h @@ -17,24 +17,19 @@ limitations under the License. */ #ifdef PADDLE_WITH_NGRAPH #include -#include #include #include #include #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/variant.h" -#include "ngraph/ngraph.hpp" +#include "ngraph/type/element_type.hpp" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 362cda3f23..e8ecd90502 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -81,13 +81,35 @@ class CompileTimeInferShapeContext : public InferShapeContext { "The %s[%d] is @EMPTY@", out, j); auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); - if (in_var->GetType() != proto::VarType::LOD_TENSOR) { - VLOG(30) << "input " << in << " is not LodTensor"; + if (in_var->GetType() != proto::VarType::LOD_TENSOR && + in_var->GetType() != proto::VarType::LOD_TENSOR_ARRAY) { + VLOG(3) << "input " << in << " is not LodTensor or LodTensorArray."; return; } out_var->SetLoDLevel(in_var->GetLoDLevel()); } + void DecreaseLoDLevel(const std::string &in, const std::string &out, + size_t i = 0, size_t j = 0) const override { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName, + "The %s[%d] is @EMPTY@", in, i); + PADDLE_ENFORCE(Outputs(out)[j] != 
framework::kEmptyVarName, + "The %s[%d] is @EMPTY@", out, j); + auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); + auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); + PADDLE_ENFORCE(out_var->GetType() == proto::VarType::LOD_TENSOR_ARRAY || + out_var->GetType() == proto::VarType::LOD_TENSOR, + "The input %s should be LodTensorArray or LodTensor.", + out_var->Name()); + PADDLE_ENFORCE(in_var->GetType() == proto::VarType::LOD_TENSOR, + "The input %s should be LodTensor.", in_var->Name()); + if (in_var->GetLoDLevel() > 0) { + out_var->SetLoDLevel(in_var->GetLoDLevel() - 1); + } + } + bool IsRuntime() const override; protected: @@ -241,38 +263,38 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { const proto::OpProto::Attr &attr = GetProtoAttr(name); switch (attr.type()) { case proto::AttrType::BOOLEANS: { - VLOG(110) << "SetAttr: " << Type() << ", " << name - << " from INTS to BOOLEANS"; + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to BOOLEANS"; this->attrs_[name] = std::vector(); break; } case proto::AttrType::INTS: { - VLOG(110) << "SetAttr: " << Type() << ", " << name - << " from INTS to INTS"; + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to INTS"; this->attrs_[name] = std::vector(); break; } case proto::AttrType::LONGS: { - VLOG(110) << "SetAttr: " << Type() << ", " << name - << " from LONGS to LONGS"; + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from LONGS to LONGS"; this->attrs_[name] = std::vector(); break; } case proto::AttrType::FLOATS: { - VLOG(110) << "SetAttr: " << Type() << ", " << name - << " from INTS to FLOATS"; + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to FLOATS"; this->attrs_[name] = std::vector(); break; } case proto::AttrType::STRINGS: { - VLOG(110) << "SetAttr: " << Type() << ", " << name - << " from INTS to STRINGS"; + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to STRINGS"; this->attrs_[name] = std::vector(); break; } case proto::AttrType::BLOCKS: { - VLOG(110) << "SetAttr: " << Type() << ", " << name - << " from INTS to BLOCKS"; + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to BLOCKS"; this->SetBlocksAttr(name, std::vector()); return; } @@ -505,13 +527,13 @@ void OpDesc::CheckAttrs() { } void OpDesc::InferShape(const BlockDesc &block) const { - VLOG(30) << "CompileTime infer shape on " << Type(); + VLOG(3) << "CompileTime infer shape on " << Type(); InitInferShapeFuncs(); auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; PADDLE_ENFORCE(static_cast(infer_shape), "%s's infer_shape has not been registered", this->Type()); CompileTimeInferShapeContext ctx(*this, block); - if (VLOG_IS_ON(100)) { + if (VLOG_IS_ON(10)) { std::ostringstream sout; auto inames = this->InputArgumentNames(); sout << " From ["; @@ -522,7 +544,7 @@ void OpDesc::InferShape(const BlockDesc &block) const { std::copy(onames.begin(), onames.end(), std::ostream_iterator(sout, ", ")); sout << "]"; - VLOG(100) << sout.str(); + VLOG(10) << sout.str(); } infer_shape(&ctx); } @@ -613,7 +635,7 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { auto shape = var->GetShape(); res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape); } catch (...) 
{ - VLOG(50) << "GetDim of variable " << name << " error"; + VLOG(5) << "GetDim of variable " << name << " error"; std::rethrow_exception(std::current_exception()); } return res; @@ -630,7 +652,7 @@ std::vector CompileTimeInferShapeContext::GetRepeatedDims( res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s)); } } catch (...) { - VLOG(50) << "GetRepeatedDim of variable " << name << " error."; + VLOG(5) << "GetRepeatedDim of variable " << name << " error."; std::rethrow_exception(std::current_exception()); } return res; diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc index 4a841bae83..bfc411ca2c 100644 --- a/paddle/fluid/framework/op_registry.cc +++ b/paddle/fluid/framework/op_registry.cc @@ -46,9 +46,9 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap( std::unique_ptr OpRegistry::CreateOp( const proto::OpDesc& op_desc) { - VLOG(10) << "CreateOp directly from OpDesc is deprecated. It should only be" - "used in unit tests. Use CreateOp(const OpDesc& op_desc) " - "instead."; + VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be" + "used in unit tests. Use CreateOp(const OpDesc& op_desc) " + "instead."; VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); AttributeMap attrs; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 0084573cd0..c6f3254e9f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -139,7 +139,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - VLOG(40) << place << " " << DebugStringEx(&scope); + VLOG(4) << place << " " << DebugStringEx(&scope); if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("Cannot run operator on place %s", place); @@ -149,20 +149,17 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #endif } -// The profile has a process-wide mutex, results in serious performance issue -// in concurrency scenerio. Here use an `if` to fix this issue. -// Please not remove the `if`, ask @Superjomn if there are any concern. -#ifndef _WIN32 + // The profile has a process-wide mutex, results in serious performance issue + // in concurrency scenerio. Here use an `if` to fix this issue. + // Please not remove the `if`, ask @Superjomn if there are any concern. 
if (platform::IsProfileEnabled()) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(Type(), pool.Get(place)); RunImpl(scope, place); - } else -#endif - { + } else { RunImpl(scope, place); } - VLOG(30) << place << " " << DebugStringEx(&scope); + VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -626,6 +623,11 @@ class RuntimeInferShapeContext : public InferShapeContext { out_tensor->set_layout(in_tensor.layout()); } + void DecreaseLoDLevel(const std::string& in, const std::string& out, + size_t i = 0, size_t j = 0) const override { + PADDLE_THROW("DecreaseLoDLevel is only used in compile time."); + } + bool IsRuntime() const override { return true; } protected: @@ -693,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name, "Tensor %s contains NAN", name); } +void OperatorWithKernel::RuntimeInferShape(const Scope& scope, + const platform::Place& place) const { + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); +} + void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); @@ -719,14 +727,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope, auto expected_kernel_key = this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); - VLOG(30) << "expected_kernel_key:" << expected_kernel_key; + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_MKLDNN // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set if (kernel_iter == kernels.end() && expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { - VLOG(30) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; expected_kernel_key.library_type_ = LibraryType::kPlain; expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; kernel_iter = kernels.find(expected_kernel_key); @@ -778,8 +786,7 @@ void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { for (auto& var_name : inplace_vars) { - VLOG(30) << "share inplace var " + var_name + - " back to it's original scope"; + VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(scope.FindVar(var_name)); auto* var = transfer_scope.FindVar(var_name); @@ -820,8 +827,8 @@ Scope* OperatorWithKernel::TryTransferData( transfered_inplace_vars->emplace_back(var_name); } - VLOG(30) << "Transform Variable " << var_name << " from " - << kernel_type_for_var << " to " << expected_kernel_key; + VLOG(3) << "Transform Variable " << var_name << " from " + << kernel_type_for_var << " to " << expected_kernel_key; // In the inference scenerio, the scopes will be reused across the // batches, so the `new_scope` here will result in GPU memroy explosion diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index bfdfdc56b3..0a6a28a5bc 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -71,7 +71,7 @@ class OperatorBase; class ExecutionContext; /** - * OperatorBase has the basic element that Net will call to do computation. + * OperatorBase has the basic elements that Net will call to do computation. 
* Only CreateOperator from OpRegistry will new Operator directly. User * should always construct a proto message OpDesc and call * OpRegistry::CreateOp(op_desc) to get an Operator instance. @@ -128,6 +128,8 @@ class OperatorBase { virtual std::vector OutputVars(bool has_intermediate) const; void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } + virtual void RuntimeInferShape(const Scope& scope, + const platform::Place& place) const {} protected: std::string type_; @@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase { OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); } + void RuntimeInferShape(const Scope& scope, + const platform::Place& place) const override; + protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; virtual OpKernelType GetKernelTypeForVar( diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2c6e337568..b98408ee77 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -54,7 +54,7 @@ class ParallelExecutorPrivate { Scope *global_scope_; // not owned std::unique_ptr executor_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; #endif bool own_local_scope_; @@ -104,7 +104,7 @@ ParallelExecutor::ParallelExecutor( if (member_->use_cuda_) { // Bcast Parameters to all GPUs -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; if (nccl_id_var != nullptr) { @@ -124,7 +124,7 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert // ncclOp -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); @@ -208,12 +208,12 @@ void ParallelExecutor::BCastParamsToDevices( auto &main_tensor = main_var->Get(); if (!main_tensor.IsInitialized()) { - VLOG(30) << "one in var not inited, return!"; + VLOG(3) << "one in var not inited, return!"; continue; } auto &dims = main_tensor.dims(); if (paddle::platform::is_gpu_place(main_tensor.place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::vector buffers; size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 26cb7d51a8..0d261dd7cc 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -162,7 +162,7 @@ Variable* Scope::VarInternal(const std::string& name) { v = new Variable(); vars_[name].reset(v); - VLOG(30) << "Create variable " << name; + VLOG(3) << "Create variable " << name; v->name_ = &(vars_.find(name)->first); return v; } diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index f4f2b769d5..62a30815d4 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -206,7 +206,7 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, PADDLE_ENFORCE(value->IsInitialized(), "The value tensor should be initialized."); if (ids.numel() == 0) { - VLOG(30) << "keys is empty, please check data!"; + VLOG(3) << "keys is empty, please check data!"; } else { int64_t value_width = value_->numel() / value_->dims()[0]; PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 55ca02038e..44384082db 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -120,8 +120,22 @@ class SelectedRows { */ int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false); - void SyncIndex(); + /* + * @brief Get the index of the key from id_to_index_ map. + */ + inline int64_t GetIndexFromId(int64_t key) { + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + return -1; + } else { + return iter->second; + } + } + void SyncIndex(); + /* + * @brief Get complete Dims before + */ DDim GetCompleteDims() const { std::vector dims = vectorize(value_->dims()); dims[0] = height_; @@ -133,9 +147,10 @@ class SelectedRows { // SelectedRows are simply concated when adding together. Until a // SelectedRows add a Tensor, will the duplicate rows be handled. 
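The selected_rows.h hunk above adds GetIndexFromId(), a read-only lookup that returns -1 for an unknown key instead of growing the table, and the patch's own comment warns that the underlying map is unreliable once rows_ contains duplicates. A usage sketch on the caller side (the populated table is assumed, it is not part of the patch):

#include "paddle/fluid/framework/selected_rows.h"

// Assumes `table` is a populated paddle::framework::SelectedRows.
int64_t LookupRowOffset(paddle::framework::SelectedRows* table, int64_t key) {
  int64_t idx = table->GetIndexFromId(key);
  if (idx < 0) {
    // Key is absent: the caller decides whether to fall back to
    // AutoGrownIndex(key, /*auto_grown=*/true) or treat it as a miss.
    return -1;
  }
  return idx;  // row `idx` of table->value() holds the data for `key`
}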
Vector rows_; - std::unordered_map id_to_index_; + std::unordered_map + id_to_index_; // should not be used when rows_ has duplicate member std::unique_ptr value_{nullptr}; - int64_t height_; + int64_t height_; // height indicates the underline tensor's height std::unique_ptr rwlock_{nullptr}; }; diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index 280bc19dce..d73cca121e 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -62,6 +62,9 @@ class InferShapeContext { virtual void ShareLoD(const std::string &in, const std::string &out, size_t i = 0, size_t j = 0) const = 0; + virtual void DecreaseLoDLevel(const std::string &in, const std::string &out, + size_t i = 0, size_t j = 0) const = 0; + virtual bool IsRuntime() const = 0; std::vector GetInputVarPtrs(const std::string &name); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 8d8f07a1f5..ca1e01c89f 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -22,8 +22,8 @@ namespace framework { void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst) { - VLOG(30) << "TensorCopy " << src.dims() << " from " << src.place() << " to " - << dst_place; + VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " + << dst_place; src.check_memory_size(); dst->Resize(src.dims()); @@ -37,8 +37,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { if (src_ptr == dst_ptr) { - VLOG(30) << "Skip copy the same data async from " << src_place << " to " - << dst_place; + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; return; } memory::Copy(boost::get(dst_place), dst_ptr, @@ -77,8 +77,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { if (src_ptr == dst_ptr) { - VLOG(30) << "Skip copy the same data async from " << src_place << " to " - << dst_place; + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; return; } memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, @@ -114,8 +114,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, void TensorCopySync(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { - VLOG(30) << "TensorCopySync " << src.dims() << " from " << src.place() - << " to " << dst_place; + VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place() + << " to " << dst_place; src.check_memory_size(); dst->Resize(src.dims()); dst->set_layout(src.layout()); @@ -125,8 +125,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto size = src.numel() * SizeOfType(src.type()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { if (src_ptr == dst_ptr) { - VLOG(30) << "Skip copy the same data from " << src_place << " to " - << dst_place; + VLOG(3) << "Skip copy the same data from " << src_place << " to " + << dst_place; return; } memory::Copy(boost::get(dst_place), dst_ptr, @@ -146,8 +146,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { if (src_ptr == dst_ptr && platform::is_same_place(src_place, 
dst_place)) { - VLOG(30) << "Skip copy the same data from " << src_place << " to " - << dst_place; + VLOG(3) << "Skip copy the same data from " << src_place << " to " + << dst_place; return; } auto src_gpu_place = boost::get(src_place); diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 2dab4e793e..fcec955360 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -39,7 +39,7 @@ void ThreadPool::Init() { int num_threads = std::thread::hardware_concurrency(); if (FLAGS_dist_threadpool_size > 0) { num_threads = FLAGS_dist_threadpool_size; - VLOG(10) << "set dist_threadpool_size to " << num_threads; + VLOG(1) << "set dist_threadpool_size to " << num_threads; } PADDLE_ENFORCE_GT(num_threads, 0); threadpool_.reset(new ThreadPool(num_threads)); diff --git a/paddle/fluid/framework/transfer_scope_cache.cc b/paddle/fluid/framework/transfer_scope_cache.cc index f6219a1417..e52a8317e2 100644 --- a/paddle/fluid/framework/transfer_scope_cache.cc +++ b/paddle/fluid/framework/transfer_scope_cache.cc @@ -17,28 +17,16 @@ namespace paddle { namespace framework { -// Holds all the transfer scope across the process. std::unordered_map& global_transfer_data_cache() { - typedef std::unordered_map map_t; - thread_local std::unique_ptr x(new map_t); + thread_local auto* x = new std::unordered_map; return *x; } -// Holds all the transfer scope for this thread. std::unordered_set& global_transfer_scope_cache() { - typedef std::unordered_set set_t; - thread_local std::unique_ptr x(new set_t); + thread_local auto* x = new std::unordered_set; return *x; } -// Try to create a transfer scope. If one cached scope has match the -// requirement, just return that one. -// Inputs: -// @type0: the source kernel type. -// @type1: the target kernel type. -// @scope: the execution scope of this op. -// Returns: A scope used to hold the transfer data across the different kernel -// type. Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1, const Scope* scope) { Scope* new_scope{nullptr}; @@ -58,5 +46,27 @@ Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1, return new_scope; } +void RemoveKidsFromTransferScopeCache(Scope* scope) { + auto it = global_transfer_scope_cache().find(scope); + if (it != global_transfer_scope_cache().end()) { + global_transfer_scope_cache().erase(it); + } + for (auto* s : scope->kids()) { + auto it = global_transfer_scope_cache().find(s); + if (it != global_transfer_scope_cache().end()) { + global_transfer_scope_cache().erase(it); + } + } + + // remove global transfer data cache + auto& cache = global_transfer_data_cache(); + for (auto it = cache.begin(); it != cache.end();) { + if (it->second == scope) + it = cache.erase(it); + else + it++; + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index 29ef459b45..7e3f002b53 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -61,10 +61,10 @@ size_t VarDesc::GetTensorDescNum() const { void VarDesc::SetShapes( const std::vector> &multiple_dims) { if (multiple_dims.size() != GetTensorDescNum()) { - VLOG(30) << "WARNING: The number of given shapes(" << multiple_dims.size() - << ") doesn't match the existing tensor number(" - << GetTensorDescNum() - << "). 
The Reader is going to be reinitialized."; + VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; SetTensorDescNum(multiple_dims.size()); } std::vector tensors = mutable_tensor_descs(); @@ -94,11 +94,11 @@ void VarDesc::SetDataType(proto::VarType::Type data_type) { void VarDesc::SetDataTypes( const std::vector &multiple_data_type) { if (multiple_data_type.size() != GetTensorDescNum()) { - VLOG(30) << "WARNING: The number of given data types(" - << multiple_data_type.size() - << ") doesn't match the existing tensor number(" - << GetTensorDescNum() - << "). The Reader is going to be reinitialized."; + VLOG(3) << "WARNING: The number of given data types(" + << multiple_data_type.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; SetTensorDescNum(multiple_data_type.size()); } std::vector tensor_descs = @@ -139,11 +139,11 @@ void VarDesc::SetLoDLevel(int32_t lod_level) { void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { if (multiple_lod_level.size() != GetTensorDescNum()) { - VLOG(30) << "WARNING: The number of given lod_levels(" - << multiple_lod_level.size() - << ") doesn't match the existing tensor number(" - << GetTensorDescNum() - << "). The Reader is going to be reinitialized."; + VLOG(3) << "WARNING: The number of given lod_levels(" + << multiple_lod_level.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; SetTensorDescNum(multiple_lod_level.size()); } switch (desc_.type().type()) { diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc new file mode 100644 index 0000000000..fc4525549c --- /dev/null +++ b/paddle/fluid/framework/variable_helper.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/variable_helper.h" + +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +void InitializeVariable(Variable* var, proto::VarType::Type var_type) { + if (var_type == proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == proto::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else if (var_type == proto::VarType::FEED_MINIBATCH) { + var->GetMutable(); + } else if (var_type == proto::VarType::FETCH_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::STEP_SCOPES) { + var->GetMutable>(); + } else if (var_type == proto::VarType::LOD_RANK_TABLE) { + var->GetMutable(); + } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { + var->GetMutable(); + } else if (var_type == proto::VarType::PLACE_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::READER) { + var->GetMutable(); + } else if (var_type == proto::VarType::RAW) { + // GetMutable will be called in operator + } else { + PADDLE_THROW( + "Variable type %d is not in " + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", + var_type); + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h new file mode 100644 index 0000000000..0e0c72c362 --- /dev/null +++ b/paddle/fluid/framework/variable_helper.h @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/variable.h" +namespace paddle { +namespace framework { +void InitializeVariable(Variable *var, proto::VarType::Type var_type); +} +} diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 4bd3f93ef7..27b6b80955 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -35,4 +35,5 @@ function(inference_analysis_test TARGET) endif() endfunction(inference_analysis_test) -inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS reset_tensor_array paddle_inference_api) +inference_analysis_test(test_analyzer SRCS analyzer_tester.cc + EXTRA_DEPS reset_tensor_array paddle_inference_api) diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h index 299f235a74..d5a972fab3 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -46,8 +46,6 @@ class AnalysisPass { protected: // User should implement these. virtual void RunImpl(Argument* argument) = 0; - - Argument* argument_{nullptr}; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 84a0c3374c..cb88333d15 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace inference { @@ -75,7 +76,8 @@ void TestWord2vecPrediction(const std::string& model_path) { 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. 
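The analyzer_tester change just below exists because std::min deduces one common type from both arguments: num_elements is a size_t while the literal 5UL is an unsigned long, and on some targets (64-bit MSVC in particular) those are distinct types, so the call fails to compile. The cast in the hunk (its angle-bracket argument, presumably size_t, was lost in rendering) and an explicit template argument are equivalent fixes; a minimal sketch:

#include <algorithm>
#include <cstddef>

size_t CapAtFive(size_t num_elements) {
  // Either cast the literal or spell out the template argument; both give
  // std::min two operands of the same type.
  return std::min<size_t>(5, num_elements);
}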
- for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + for (size_t i = 0; i < std::min(static_cast(5UL), num_elements); + i++) { LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i]; PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index a30c27b118..d3ea511d8f 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -1,6 +1,7 @@ cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass) +cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass) set(analysis_deps ${analysis_deps} ir_graph_build_pass diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index 108cb6f74b..c3a2b3ca1d 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { std::vector passes({ "ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass", }); for (const auto &pass : passes) { VLOG(2) << "Run pass " << pass; diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index d5e0d90de1..740030c3a8 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -36,12 +36,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { // so that the parameters will on the same device, or they will keep copying // between difference devices. platform::Place place; - if (argument->use_gpu()) { - PADDLE_ENFORCE(argument->gpu_device_id_valid()); - place = platform::CUDAPlace(argument->gpu_device_id()); - } else { - place = platform::CPUPlace(); - } + place = platform::CPUPlace(); if (argument->model_dir_valid()) { auto program = diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc new file mode 100644 index 0000000000..8be2d3ac0b --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
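The new pass whose implementation follows copies every persistable parameter to the GPU by staging through a host tensor: the values are first copied into a temporary CPU tensor, the original tensor is reallocated on the GPU (which discards its old buffer), and the saved values are then copied onto the device. A simplified sketch of that idiom for a single tensor, with float data assumed:

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/place.h"

// Sketch only: `t` initially holds CPU data that must end up on `gpu_place`.
void MoveToGpu(paddle::framework::LoDTensor* t,
               const paddle::platform::Place& gpu_place) {
  paddle::platform::CPUPlace cpu_place;
  paddle::framework::LoDTensor staging;
  staging.Resize(t->dims());
  staging.mutable_data<float>(cpu_place);                      // host staging buffer
  paddle::framework::TensorCopySync(*t, cpu_place, &staging);  // save the old values
  t->mutable_data<float>(gpu_place);                           // reallocate t on the GPU
  paddle::framework::TensorCopySync(staging, gpu_place, t);    // push values to the device
}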
+ +#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { + PADDLE_ENFORCE(argument->scope_valid()); + PADDLE_ENFORCE(argument->use_gpu_valid()); + + platform::Place place; + + // The parameters are on the cpu, therefore, synchronization is not necessary. + if (!argument->use_gpu()) return; + + LOG(INFO) << "Sync params from CPU to GPU"; + + PADDLE_ENFORCE(argument->gpu_device_id_valid()); + place = platform::CUDAPlace(argument->gpu_device_id()); + + auto *scope = argument->scope_ptr(); + std::vector all_vars = scope->LocalVarNames(); + + // We get all the vars from local_scope instead of the ProgramDesc. + // Because there exists the case that new parameter variables are not added to + // the program in the analysis pass. + for (auto &var_name : all_vars) { + auto *var = scope->FindLocalVar(var_name); + PADDLE_ENFORCE(var != nullptr); + if (var->IsType() || + var->IsType()) { + auto *t = var->GetMutable(); + + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + temp_tensor.mutable_data(cpu_place); + + // Copy the parameter data to a tmp tensor. + TensorCopySync(*t, cpu_place, &temp_tensor); + // Reallocation the space on GPU + t->mutable_data(place); + + // Copy parameter data to newly allocated GPU space. + TensorCopySync(temp_tensor, place, t); + } + } +} + +std::string IrParamsSyncAmongDevicesPass::repr() const { + return "ir-params-sync-among-devices-pass"; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h new file mode 100644 index 0000000000..a95f460df6 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Sync parameter from CPU to GPU. 
+ */ +class IrParamsSyncAmongDevicesPass : public AnalysisPass { + public: + void RunImpl(Argument *argument) override; + std::string repr() const override; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc index 2ef515f45f..9245e32cee 100644 --- a/paddle/fluid/inference/analysis/passes/passes.cc +++ b/paddle/fluid/inference/analysis/passes/passes.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc" #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" namespace paddle { namespace inference { @@ -27,6 +28,9 @@ PassRegistry::PassRegistry() { std::unique_ptr(new IrGraphBuildPass)); passes_.emplace("ir_analysis_compose_pass", std::unique_ptr(new IrAnalysisComposePass)); + passes_.emplace( + "ir_params_sync_among_devices_pass", + std::unique_ptr(new IrParamsSyncAmongDevicesPass)); } } // namespace analysis diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 72ac534384..391330a7c0 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -55,8 +55,7 @@ bool IsPersistable(const framework::VarDesc *var) { bool AnalysisPredictor::Init( const std::shared_ptr &parent_scope, const std::shared_ptr &program) { - VLOG(30) << "Predictor::init()"; -#if !defined(_WIN32) + VLOG(3) << "Predictor::init()"; if (FLAGS_profile) { LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(INFO) << "You can turn off by set gflags '-profile false'"; @@ -64,7 +63,6 @@ bool AnalysisPredictor::Init( : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); } -#endif // no matter with or without MKLDNN paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); @@ -171,7 +169,7 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) { bool AnalysisPredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { - VLOG(30) << "Predictor::predict"; + VLOG(3) << "Predictor::predict"; inference::Timer timer; timer.tic(); // set feed variable @@ -190,17 +188,21 @@ bool AnalysisPredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to get fetches"; return false; } - VLOG(30) << "predict cost: " << timer.toc() << "ms"; - - // Fix TensorArray reuse not cleaned bug. - tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); - tensor_array_batch_cleaner_.ResetTensorArray(); + VLOG(3) << "predict cost: " << timer.toc() << "ms"; + + // All the containers in the scope will be hold in inference, but the + // operators assume that the container will be reset after each batch. + // Here is a bugfix, collect all the container variables, and reset then to a + // bool; the next time, the operator will call MutableData and construct a new + // container again, so that the container will be empty for each batch. 
+ tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_); + tensor_array_batch_cleaner_.ResetNoTensorVars(); return true; } bool AnalysisPredictor::SetFeed(const std::vector &inputs, framework::Scope *scope) { - VLOG(30) << "Predictor::set_feed"; + VLOG(3) << "Predictor::set_feed"; if (inputs.size() != feeds_.size()) { LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " << inputs.size(); @@ -277,7 +279,7 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, bool AnalysisPredictor::GetFetch(std::vector *outputs, framework::Scope *scope) { - VLOG(30) << "Predictor::get_fetch"; + VLOG(3) << "Predictor::get_fetch"; outputs->resize(fetchs_.size()); for (size_t i = 0; i < fetchs_.size(); ++i) { int idx = boost::get(fetchs_[i]->GetAttr("col")); @@ -286,6 +288,7 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, framework::GetFetchVariable(*scope, "fetch", idx); auto type = fetch.type(); auto output = &(outputs->at(i)); + output->name = fetchs_[idx]->Input("X")[0]; if (type == typeid(float)) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; @@ -340,7 +343,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { template <> std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { - VLOG(30) << "create AnalysisConfig"; + VLOG(3) << "create AnalysisConfig"; if (config.use_gpu) { // 1. GPU memeroy PADDLE_ENFORCE_GT( @@ -354,7 +357,7 @@ std::unique_ptr CreatePaddlePredictor< std::string flag = "--fraction_of_gpu_memory_to_use=" + std::to_string(config.fraction_of_gpu_memory); flags.push_back(flag); - VLOG(30) << "set flag: " << flag; + VLOG(3) << "set flag: " << flag; framework::InitGflags(flags); } } @@ -418,7 +421,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( bool AnalysisPredictor::ZeroCopyRun() { executor_->Run(); // Fix TensorArray reuse not cleaned bug. - tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); + tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_); tensor_array_batch_cleaner_.ResetTensorArray(); return true; } @@ -520,12 +523,10 @@ bool AnalysisPredictor::LoadParameters() { } AnalysisPredictor::~AnalysisPredictor() { -#if !defined(_WIN32) if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); } -#endif if (sub_scope_) { scope_->DeleteScope(sub_scope_); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index db57812bc3..12ecb7c15e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -109,7 +109,7 @@ class AnalysisPredictor : public PaddlePredictor { std::map feed_names_; std::vector fetchs_; // Memory buffer for feed inputs. The temporary LoDTensor will cause serious - // concurrency problems, so cache them. + // concurrency problems, wrong results and memory leak, so cache them. 
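The AnalysisPredictor::GetFetch() hunk above (api_impl.cc receives the same change further down) now records which variable each output came from by filling PaddleTensor::name from the fetch op's input. A usage sketch on the caller side, assuming `outputs` was just filled by PaddlePredictor::Run:

#include <iostream>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void PrintOutputNames(const std::vector<paddle::PaddleTensor>& outputs) {
  for (const paddle::PaddleTensor& t : outputs) {
    // `name` is the fetched variable's name; `data` is the raw output buffer.
    std::cout << t.name << ": " << t.data.length() << " bytes" << std::endl;
  }
}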
std::vector feed_tensors_; details::TensorArrayBatchCleaner tensor_array_batch_cleaner_; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 0f88ad14b0..4c5b412a2c 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -64,7 +64,6 @@ void NativePaddlePredictor::PrepareFeedFetch() { bool NativePaddlePredictor::Init( std::shared_ptr parent_scope) { VLOG(3) << "Predictor::init()"; -#if !defined(_WIN32) if (FLAGS_profile) { LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(INFO) << "You can turn off by set gflags '-profile false'"; @@ -73,7 +72,6 @@ bool NativePaddlePredictor::Init( : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); } -#endif // no matter with or without MKLDNN paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); @@ -121,12 +119,10 @@ bool NativePaddlePredictor::Init( } NativePaddlePredictor::~NativePaddlePredictor() { -#if !defined(_WIN32) if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); } -#endif if (sub_scope_) { scope_->DeleteScope(sub_scope_); } @@ -156,11 +152,11 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to get fetches"; return false; } - VLOG(30) << "predict cost: " << timer.toc() << "ms"; + VLOG(3) << "predict cost: " << timer.toc() << "ms"; - // Fix TensorArray reuse not cleaned bug. - tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); - tensor_array_batch_cleaner_.ResetTensorArray(); + // For some other vector like containers not cleaned after each batch. + tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get()); + tensor_array_batch_cleaner_.ResetNoTensorVars(); return true; } @@ -189,8 +185,12 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, << inputs.size(); return false; } + + // Cache the inputs memory for better concurrency performance. + feed_tensors_.resize(inputs.size()); + for (size_t i = 0; i < inputs.size(); ++i) { - framework::LoDTensor input; + auto &input = feed_tensors_[i]; framework::DDim ddim = framework::make_ddim(inputs[i].shape); void *input_ptr; if (inputs[i].dtype == PaddleDType::INT64) { @@ -265,6 +265,7 @@ bool NativePaddlePredictor::GetFetch(std::vector *outputs, framework::GetFetchVariable(*scope, "fetch", idx); auto type = fetch.type(); auto output = &(outputs->at(i)); + output->name = fetchs_[idx]->Input("X")[0]; if (type == typeid(float)) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 9dfa48d501..c1fcd198cc 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -69,6 +69,9 @@ class NativePaddlePredictor : public PaddlePredictor { std::vector feeds_; std::map feed_names_; std::vector fetchs_; + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious + // concurrency problems, wrong results and memory leak, so cache them. 
+ std::vector feed_tensors_; // Do not use unique_ptr, use parent scope to delete framework::Scope *sub_scope_{nullptr}; details::TensorArrayBatchCleaner tensor_array_batch_cleaner_; diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 8fb464c0f5..ec93729cd2 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -79,6 +79,16 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (NOT WIN32) + set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph") + if(EXISTS ${NGRAPH_PATH}) + include(GNUInstallDirs) + include_directories("${NGRAPH_PATH}/include") + link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}") + set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) if(WITH_MKL) @@ -106,7 +116,7 @@ endif() if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} + ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} glog gflags protobuf snappystream snappy z xxhash ${EXTERNAL_LIB}) else() diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index ff718077c1..a94ccfa924 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -54,6 +54,9 @@ mkdir -p build cd build for WITH_STATIC_LIB in ON OFF; do +# TODO(Superjomn) reopen this +# something wrong with the TensorArray reset. +:< output; predictor->Run({input}, &output, 1); - VLOG(30) << "output.size " << output.size(); + VLOG(3) << "output.size " << output.size(); auto& tensor = output.front(); - VLOG(30) << "output: " << SummaryTensor(tensor); + VLOG(3) << "output: " << SummaryTensor(tensor); // compare with reference result CheckOutput(FLAGS_refer, tensor); diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h index 664b9d01c7..d70c6aea79 100644 --- a/paddle/fluid/inference/api/demo_ci/utils.h +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -47,7 +47,7 @@ static void split(const std::string& str, char sep, } Record ProcessALine(const std::string& line) { - VLOG(30) << "process a line"; + VLOG(3) << "process a line"; std::vector columns; split(line, '\t', &columns); CHECK_EQ(columns.size(), 2UL) @@ -65,8 +65,8 @@ Record ProcessALine(const std::string& line) { for (auto& s : shape_strs) { record.shape.push_back(std::stoi(s)); } - VLOG(30) << "data size " << record.data.size(); - VLOG(30) << "data shape size " << record.shape.size(); + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); return record; } @@ -78,8 +78,8 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) { file.close(); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); - VLOG(30) << "predictor output numel " << numel; - VLOG(30) << "reference output numel " << refer.data.size(); + VLOG(3) << "predictor output numel " << numel; + VLOG(3) << "reference output numel " << refer.data.size(); CHECK_EQ(numel, refer.data.size()); switch (output.dtype) { case PaddleDType::INT64: { diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc index 244b0b567b..569a487328 100644 --- 
a/paddle/fluid/inference/api/details/reset_tensor_array.cc +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -26,7 +26,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { // parameter. if (var_name == "feed" || var_name == "fetch") continue; if (var->Type() == typeid(framework::LoDTensorArray)) { - VLOG(40) << "collect " << var_name; + VLOG(4) << "collect " << var_name; arrays_.push_back(var->GetMutable()); } } @@ -34,7 +34,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { CollectTensorArrays(kid); } - VLOG(30) << "Collect " << arrays_.size() << " arrays"; + VLOG(3) << "Collect " << arrays_.size() << " arrays"; flag_ = false; } } @@ -46,5 +46,28 @@ void TensorArrayBatchCleaner::ResetTensorArray() { } } +void TensorArrayBatchCleaner::CollectNoTensorVars(framework::Scope *scope) { + if (no_tensor_flag_) { + for (auto &var_name : scope->LocalVarNames()) { + auto *var = scope->FindVar(var_name); + if (!var->IsInitialized()) continue; + if (!valid_types_.count(var->Type())) { + no_tensor_vars_.insert(var); + } + } + + for (auto *kid : scope->kids()) { + CollectTensorArrays(kid); + } + no_tensor_flag_ = false; // Only collect one time. + } +} + +void TensorArrayBatchCleaner::ResetNoTensorVars() { + for (auto *var : no_tensor_vars_) { + var->Clear(); + } +} + } // namespace details } // namespace paddle diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index a39449ff0e..6a5ea64de6 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -14,9 +14,11 @@ #pragma once +#include #include #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" namespace paddle { namespace details { @@ -24,13 +26,28 @@ namespace details { // Clean the TensorArray each batch to make the behavior the same with the // training phase. struct TensorArrayBatchCleaner { + TensorArrayBatchCleaner() { + valid_types_.insert(typeid(framework::Tensor)); + valid_types_.insert(typeid(framework::LoDTensor)); + } + // Collect the variables that are not Tensor or LoDTensor, and reset them to a + // bool(trick), because some of them are containers, and some operators just + // keep inserting new items without clearing the containers first; So the + // memory grow larger and larger in inference service deployed online. + void CollectNoTensorVars(framework::Scope *scope); + void ResetNoTensorVars(); + // Fix the tensor array not clear in the inference scenarios. 
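The header above spells out the trick: variables that are neither Tensor nor LoDTensor are treated as growing containers, collected once, and Clear()-ed after every batch so each run starts from empty state. A usage sketch of that collect-once / reset-per-batch cycle (names come from the patch; the surrounding driver loop is assumed):

// Sketch: run a stream of batches through one predictor scope.
void ServeBatches(paddle::details::TensorArrayBatchCleaner* cleaner,
                  paddle::framework::Scope* scope, int num_batches) {
  for (int i = 0; i < num_batches; ++i) {
    // ... feed inputs for batch i and execute the program ...
    cleaner->CollectNoTensorVars(scope);  // walks the scope only on the first batch
    cleaner->ResetNoTensorVars();         // Clear()s the remembered container variables
  }
}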
void CollectTensorArrays(framework::Scope *scope); void ResetTensorArray(); private: bool flag_{true}; + bool no_tensor_flag_{true}; std::vector arrays_; + + std::unordered_set valid_types_; + std::unordered_set no_tensor_vars_; }; } // namespace details diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 6f9d663121..9a393a61c4 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,10 +15,6 @@ #pragma once #include -#if !defined(_WIN32) -#include -#else -#endif #include #include // NOLINT @@ -28,6 +24,7 @@ #include #include #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/string/printf.h" namespace paddle { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 12e3a6f42e..825bee833b 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -116,12 +116,8 @@ class CpuPassStrategy : public PassStrategy { class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { - // TODO(NHZlX) Problem with Data synchronization between GPU and CPU - // When running in GPU mode, the parameters are all on GPU. But the - // opearations of "conv_bn_fuse_pass" are on CPU. passes_.assign({ - "infer_clean_graph_pass", - // "infer_clean_graph_pass", "conv_bn_fuse_pass", + "infer_clean_graph_pass", "conv_bn_fuse_pass", }); } diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index bb749e8f8b..31f43bfdca 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -78,7 +78,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, for (auto* var : global_block.AllVars()) { if (IsPersistable(var)) { - VLOG(30) << "persistable variable's name: " << var->Name(); + VLOG(3) << "persistable variable's name: " << var->Name(); framework::VarDesc* new_var = load_block->Var(var->Name()); new_var->SetShape(var->GetShape()); @@ -121,7 +121,7 @@ std::unique_ptr Load(framework::Executor* executor, const std::string& dirname) { std::string model_filename = dirname + "/__model__"; std::string program_desc_str; - VLOG(30) << "loading model from " << model_filename; + VLOG(3) << "loading model from " << model_filename; ReadBinaryFile(model_filename, &program_desc_str); std::unique_ptr main_program( diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index d700e08590..343fd3f7c5 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -53,7 +53,7 @@ class Pool2dOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc &op, const framework::Scope &scope, bool test_mode) override { - VLOG(40) + VLOG(4) << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); // Declare inputs diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 7dc88d9dd0..a07626a103 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -46,11 +46,18 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn2 
${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc) -# DAM +# normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc) +# small DAM +set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") +download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") +inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1) + # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") diff --git a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc index c4022225fd..da42688f29 100644 --- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include #include #include diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index b369cba5c8..a3a6130db7 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -14,38 +14,54 @@ #include "paddle/fluid/inference/tests/api/tester_helper.h" +DEFINE_int32(max_turn_num, 9, + "The max turn number: 1 for the small and 9 for the normal."); + namespace paddle { namespace inference { using contrib::AnalysisConfig; -#define MAX_TURN_NUM 9 -#define MAX_TURN_LEN 50 + +constexpr int32_t kMaxTurnLen = 50; + static std::vector result_data; struct DataRecord { - std::vector> - turns[MAX_TURN_NUM]; // turns data : MAX_TURN_NUM - std::vector> - turns_mask[MAX_TURN_NUM]; // turns mask data : MAX_TURN_NUM - std::vector> response; // response data : 1 + std::vector> *turns; + std::vector> *turns_mask; + std::vector> response; // response data : 1 std::vector> response_mask; // response mask data : 1 size_t batch_iter{0}; size_t batch_size{1}; size_t num_samples; // total number of samples - DataRecord() = default; + + DataRecord() { + turns = new std::vector>[FLAGS_max_turn_num]; // turns data : FLAGS_max_turn_num + turns_mask = new std::vector>[FLAGS_max_turn_num]; // turns mask data : FLAGS_max_turn_num + } + explicit DataRecord(const std::string &path, int batch_size = 1) - : batch_size(batch_size) { + : DataRecord() { + this->batch_size = batch_size; Load(path); } + + ~DataRecord() { + delete[] turns; + delete[] turns_mask; + } + DataRecord NextBatch() { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. 
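The DAM tester rework above replaces the compile-time MAX_TURN_NUM macro with a DEFINE_int32 gflag so one binary can drive both the normal (9-turn) and small (1-turn) models; because the turn count is now only known at run time, the per-turn containers move from fixed-size arrays to heap allocations sized from the flag. A reduced sketch of that shape (element types are illustrative):

#include <cstdint>
#include <vector>
#include <gflags/gflags.h>

DEFINE_int32(max_turn_num, 9,
             "The max turn number: 1 for the small and 9 for the normal.");

struct TurnsData {
  std::vector<std::vector<int64_t>>* turns;  // one vector of sequences per turn
  TurnsData()
      : turns(new std::vector<std::vector<int64_t>>[FLAGS_max_turn_num]) {}
  ~TurnsData() { delete[] turns; }
};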
if (batch_end <= response.size()) { - for (int i = 0; i < MAX_TURN_NUM; ++i) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { data.turns[i].assign(turns[i].begin() + batch_iter, turns[i].begin() + batch_end); } - for (int i = 0; i < MAX_TURN_NUM; ++i) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { data.turns_mask[i].assign(turns_mask[i].begin() + batch_iter, turns_mask[i].begin() + batch_end); } @@ -60,6 +76,7 @@ struct DataRecord { batch_iter += batch_size; return data; } + void Load(const std::string &path) { std::ifstream file(path); std::string line; @@ -69,30 +86,30 @@ struct DataRecord { num_lines++; std::vector data; split(line, ',', &data); - CHECK_EQ(data.size(), (size_t)(2 * MAX_TURN_NUM + 3)); + CHECK_EQ(data.size(), (size_t)(2 * FLAGS_max_turn_num + 3)); // load turn data - std::vector turns_tmp[MAX_TURN_NUM]; - for (int i = 0; i < MAX_TURN_NUM; ++i) { + std::vector turns_tmp[FLAGS_max_turn_num]; + for (int i = 0; i < FLAGS_max_turn_num; ++i) { split_to_int64(data[i], ' ', &turns_tmp[i]); turns[i].push_back(std::move(turns_tmp[i])); } // load turn_mask data - std::vector turns_mask_tmp[MAX_TURN_NUM]; - for (int i = 0; i < MAX_TURN_NUM; ++i) { - split_to_float(data[MAX_TURN_NUM + i], ' ', &turns_mask_tmp[i]); + std::vector turns_mask_tmp[FLAGS_max_turn_num]; + for (int i = 0; i < FLAGS_max_turn_num; ++i) { + split_to_float(data[FLAGS_max_turn_num + i], ' ', &turns_mask_tmp[i]); turns_mask[i].push_back(std::move(turns_mask_tmp[i])); } // load response data std::vector response_tmp; - split_to_int64(data[2 * MAX_TURN_NUM], ' ', &response_tmp); + split_to_int64(data[2 * FLAGS_max_turn_num], ' ', &response_tmp); response.push_back(std::move(response_tmp)); // load response_mask data std::vector response_mask_tmp; - split_to_float(data[2 * MAX_TURN_NUM + 1], ' ', &response_mask_tmp); + split_to_float(data[2 * FLAGS_max_turn_num + 1], ' ', &response_mask_tmp); response_mask.push_back(std::move(response_mask_tmp)); // load result data float result_tmp; - result_tmp = std::stof(data[2 * MAX_TURN_NUM + 2]); + result_tmp = std::stof(data[2 * FLAGS_max_turn_num + 2]); result_data.push_back(result_tmp); } num_samples = num_lines; @@ -101,8 +118,8 @@ struct DataRecord { void PrepareInputs(std::vector *input_slots, DataRecord *data, int batch_size) { - PaddleTensor turns_tensor[MAX_TURN_NUM]; - PaddleTensor turns_mask_tensor[MAX_TURN_NUM]; + PaddleTensor turns_tensor[FLAGS_max_turn_num]; + PaddleTensor turns_mask_tensor[FLAGS_max_turn_num]; PaddleTensor response_tensor; PaddleTensor response_mask_tensor; std::string turn_pre = "turn_"; @@ -110,16 +127,16 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, auto one_batch = data->NextBatch(); int size = one_batch.response[0].size(); - CHECK_EQ(size, MAX_TURN_LEN); + CHECK_EQ(size, kMaxTurnLen); // turn tensor assignment - for (int i = 0; i < MAX_TURN_NUM; ++i) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { turns_tensor[i].name = turn_pre + std::to_string(i); turns_tensor[i].shape.assign({batch_size, size, 1}); turns_tensor[i].dtype = PaddleDType::INT64; TensorAssignData(&turns_tensor[i], one_batch.turns[i]); } // turn mask tensor assignment - for (int i = 0; i < MAX_TURN_NUM; ++i) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { turns_mask_tensor[i].name = turn_mask_pre + std::to_string(i); turns_mask_tensor[i].shape.assign({batch_size, size, 1}); turns_mask_tensor[i].dtype = PaddleDType::FLOAT32; @@ -137,10 +154,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, 
TensorAssignData(&response_mask_tensor, one_batch.response_mask); // Set inputs. - for (int i = 0; i < MAX_TURN_NUM; ++i) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { input_slots->push_back(std::move(turns_tensor[i])); } - for (int i = 0; i < MAX_TURN_NUM; ++i) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { input_slots->push_back(std::move(turns_mask_tensor[i])); } input_slots->push_back(std::move(response_tensor)); @@ -202,8 +219,6 @@ TEST(Analyzer_dam, fuse_statis) { auto fuse_statis = GetFuseStatis( static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); - EXPECT_EQ(fuse_statis.at("fc_fuse"), 317); - EXPECT_EQ(num_ops, 2020); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 956a235edc..adaa338e28 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -27,7 +27,7 @@ struct Record { }; Record ProcessALine(const std::string &line) { - VLOG(30) << "process a line"; + VLOG(3) << "process a line"; std::vector columns; split(line, '\t', &columns); CHECK_EQ(columns.size(), 2UL) @@ -45,8 +45,8 @@ Record ProcessALine(const std::string &line) { for (auto &s : shape_strs) { record.shape.push_back(std::stoi(s)); } - VLOG(30) << "data size " << record.data.size(); - VLOG(30) << "data shape size " << record.shape.size(); + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); return record; } diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 1dc1678406..d572ea0177 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -178,11 +178,9 @@ void TestOneThreadPrediction( warmup_timer.tic(); predictor->Run(inputs[0], outputs, batch_size); PrintTime(batch_size, 1, 1, 0, warmup_timer.toc(), 1); -#if !defined(_WIN32) if (FLAGS_profile) { paddle::platform::ResetProfiler(); } -#endif } LOG(INFO) << "Run " << num_times << " times..."; @@ -232,11 +230,9 @@ void TestMultiThreadPrediction( warmup_timer.tic(); predictor->Run(inputs[0], outputs, batch_size); PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1); -#if !defined(_WIN32) if (FLAGS_profile) { paddle::platform::ResetProfiler(); } -#endif } LOG(INFO) << "Thread " << tid << " run " << num_times << " times..."; diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index cbcfc964c9..5c1204b9e6 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include #include // NOLINT diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 2118fcfd4b..75fa611c0d 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(use_mkldnn); diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc index 021edc2de5..d03aa11b75 100644 --- a/paddle/fluid/inference/utils/benchmark.cc +++ b/paddle/fluid/inference/utils/benchmark.cc @@ -33,7 +33,7 @@ std::string Benchmark::SerializeToString() const { ss << batch_size_ << "\t"; ss << num_threads_ << "\t"; ss << latency_ << "\t"; - ss << 1000 / latency_; + ss << 1000.0 / latency_; ss << '\n'; return ss.str(); } diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h index 80e8f77adb..76a3dd2c29 100644 --- a/paddle/fluid/inference/utils/benchmark.h +++ b/paddle/fluid/inference/utils/benchmark.h @@ -11,9 +11,11 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#pragma once #include #include +#include namespace paddle { namespace inference { @@ -31,8 +33,8 @@ struct Benchmark { bool use_gpu() const { return use_gpu_; } void SetUseGpu() { use_gpu_ = true; } - int latency() const { return latency_; } - void SetLatency(int x) { latency_ = x; } + float latency() const { return latency_; } + void SetLatency(float x) { latency_ = x; } const std::string& name() const { return name_; } void SetName(const std::string& name) { name_ = name; } @@ -43,7 +45,7 @@ struct Benchmark { private: bool use_gpu_{false}; int batch_size_{0}; - int latency_; + float latency_; int num_threads_{1}; std::string name_; }; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index e207a853c8..794d729bdc 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -76,12 +76,12 @@ class ChunkedAllocator : public Allocator { default_allocator_ = raw_allocator_; } else { if (capacity == 1) { - VLOG(10) << "Create BestFitAllocator with chunk_size " - << max_chunk_size_; + VLOG(1) << "Create BestFitAllocator with chunk_size " + << max_chunk_size_; default_allocator_ = CreateAllocatorWithChunk(); } else { - VLOG(10) << "Create AutoIncrementAllocator with chunk_size " - << max_chunk_size_ << " and capacity " << capacity; + VLOG(1) << "Create AutoIncrementAllocator with chunk_size " + << max_chunk_size_ << " and capacity " << capacity; default_allocator_ = std::make_shared( [this] { return std::move(CreateAllocatorWithChunk()); }, capacity); } diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc index 20748a23a1..b274b05562 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc @@ -99,9 +99,8 @@ TEST(BestFitAllocator, test_concurrent_cpu_allocation) { LockedAllocator locked_allocator(std::move(best_fit_allocator)); - auto th_main = [&] { - std::random_device dev; - std::default_random_engine engine(dev()); + auto th_main = [&](std::random_device::result_type seed) { + std::default_random_engine engine(seed); std::uniform_int_distribution dist(1U, 1024U); for (size_t i = 0; i < 128; ++i) { @@ -125,7 +124,8 @@ TEST(BestFitAllocator, test_concurrent_cpu_allocation) { { std::vector threads; for (size_t i = 0; i < 1024; ++i) { - 
threads.emplace_back(th_main); + std::random_device dev; + threads.emplace_back(th_main, dev()); } for (auto& th : threads) { th.join(); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu index f7f17e1d36..fdd5b43ad4 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -41,9 +41,8 @@ TEST(BestFitAllocator, concurrent_cuda) { LockedAllocator concurrent_allocator( std::unique_ptr(new BestFitAllocator(cuda_allocation.get()))); - auto th_main = [&] { - std::random_device dev; - std::default_random_engine engine(dev()); + auto th_main = [&](std::random_device::result_type seed) { + std::default_random_engine engine(seed); std::uniform_int_distribution dist(1U, 1024U); platform::CUDAPlace gpu(0); platform::CUDADeviceContext dev_ctx(gpu); @@ -75,7 +74,8 @@ TEST(BestFitAllocator, concurrent_cuda) { { std::vector threads; for (size_t i = 0; i < 1024; ++i) { - threads.emplace_back(th_main); + std::random_device dev; + threads.emplace_back(th_main, dev()); } for (auto& th : threads) { th.join(); diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index e665372723..26e2038a53 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -86,18 +86,18 @@ struct NaiveAllocator { template <> void *Alloc(const platform::CPUPlace &place, size_t size) { - VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + VLOG(1) << "Allocate " << size << " bytes on " << platform::Place(place); void *p = GetCPUBuddyAllocator()->Alloc(size); if (FLAGS_init_allocated_mem) { memset(p, 0xEF, size); } - VLOG(100) << " pointer=" << p; + VLOG(10) << " pointer=" << p; return p; } template <> void Free(const platform::CPUPlace &place, void *p) { - VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + VLOG(1) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -124,12 +124,12 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { std::unique_ptr(new detail::GPUAllocator(i)), platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); - VLOG(100) << "\n\nNOTE: each GPU device use " - << FLAGS_fraction_of_gpu_memory_to_use * 100 - << "% of GPU memory.\n" - << "You can set GFlags environment variable '" - << "FLAGS_fraction_of_gpu_memory_to_use" - << "' to change the fraction of GPU usage.\n\n"; + VLOG(10) << "\n\nNOTE: each GPU device use " + << FLAGS_fraction_of_gpu_memory_to_use * 100 + << "% of GPU memory.\n" + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" + << "' to change the fraction of GPU usage.\n\n"; } }); diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index dd7ffaa264..26ef27c3ca 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -32,11 +32,11 @@ BuddyAllocator::BuddyAllocator( system_allocator_(std::move(system_allocator)) {} BuddyAllocator::~BuddyAllocator() { - VLOG(100) << "BuddyAllocator Disconstructor makes sure that all of these " - "have actually been freed"; + VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; while (!pool_.empty()) { auto block = static_cast(std::get<2>(*pool_.begin())); - VLOG(100) << "Free from block (" << 
block << ", " << max_chunk_size_ << ")"; + VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); @@ -57,12 +57,12 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // acquire the allocator lock std::lock_guard lock(mutex_); - VLOG(100) << "Allocate " << unaligned_size << " bytes from chunk size " - << size; + VLOG(10) << "Allocate " << unaligned_size << " bytes from chunk size " + << size; // if the allocation is huge, send directly to the system allocator if (size > max_chunk_size_) { - VLOG(100) << "Allocate from system allocator."; + VLOG(10) << "Allocate from system allocator."; return SystemAlloc(size); } @@ -77,9 +77,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { return nullptr; } } else { - VLOG(100) << "Allocation from existing memory block " << std::get<2>(*it) - << " at address " - << reinterpret_cast(std::get<2>(*it))->data(); + VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); } total_used_ += size; @@ -96,10 +96,10 @@ void BuddyAllocator::Free(void* p) { // Acquire the allocator lock std::lock_guard lock(mutex_); - VLOG(100) << "Free from address " << block; + VLOG(10) << "Free from address " << block; if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { - VLOG(100) << "Free directly from system allocator"; + VLOG(10) << "Free directly from system allocator"; system_allocator_->Free(block, block->total_size(cache_), block->index(cache_)); @@ -116,8 +116,8 @@ void BuddyAllocator::Free(void* p) { // Trying to merge the right buddy if (block->has_right_buddy(cache_)) { - VLOG(100) << "Merging this block " << block << " with its right buddy " - << block->right_buddy(cache_); + VLOG(10) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); auto right_buddy = block->right_buddy(cache_); @@ -134,8 +134,8 @@ void BuddyAllocator::Free(void* p) { // Trying to merge the left buddy if (block->has_left_buddy(cache_)) { - VLOG(100) << "Merging this block " << block << " with its left buddy " - << block->left_buddy(cache_); + VLOG(10) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); auto left_buddy = block->left_buddy(cache_); @@ -151,8 +151,8 @@ void BuddyAllocator::Free(void* p) { } // Dumping this block into pool - VLOG(100) << "Inserting free block (" << block << ", " - << block->total_size(cache_) << ")"; + VLOG(10) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; pool_.insert( IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); @@ -174,7 +174,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) { size_t index = 0; void* p = system_allocator_->Alloc(&index, size); - VLOG(100) << "Allocated " << p << " from system allocator."; + VLOG(10) << "Allocated " << p << " from system allocator."; if (p == nullptr) return nullptr; @@ -200,8 +200,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { if (p == nullptr) return pool_.end(); - VLOG(100) << "Creating and inserting new block " << p - << " from system allocator"; + VLOG(10) << "Creating and inserting new block " << p + << " from system allocator"; static_cast(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index, max_chunk_size_, nullptr, nullptr); @@ -245,19 +245,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, auto block = 
static_cast(std::get<2>(*it)); pool_.erase(it); - VLOG(100) << "Split block (" << block << ", " << block->total_size(cache_) - << ") into"; + VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; block->split(&cache_, size); - VLOG(100) << "Left block (" << block << ", " << block->total_size(cache_) - << ")"; + VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; block->set_type(&cache_, MemoryBlock::ARENA_CHUNK); // the rest of memory if exist if (block->has_right_buddy(cache_)) { if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { - VLOG(100) << "Insert right block (" << block->right_buddy(cache_) << ", " - << block->right_buddy(cache_)->total_size(cache_) << ")"; + VLOG(10) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; pool_.insert( IndexSizeAddress(block->right_buddy(cache_)->index(cache_), @@ -284,7 +284,7 @@ void BuddyAllocator::CleanIdleFallBackAlloc() { return; } - VLOG(100) << "Return block " << block << " to fallback allocator."; + VLOG(10) << "Return block " << block << " to fallback allocator."; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); @@ -320,7 +320,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() { MemoryBlock* block = static_cast(std::get<2>(*pool)); - VLOG(100) << "Return block " << block << " to base allocator."; + VLOG(10) << "Return block " << block << " to base allocator."; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc index 152e4e7f9f..b86e4f38c4 100644 --- a/paddle/fluid/memory/detail/meta_cache.cc +++ b/paddle/fluid/memory/detail/meta_cache.cc @@ -29,7 +29,7 @@ MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const { return existing_desc->second; } else { auto* desc = reinterpret_cast(block); - VLOG(100) << "Load MemoryBlock::Desc type=" << desc->type; + VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type; PADDLE_ASSERT(desc->check_guards()); return *reinterpret_cast(block); } diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 2019d1a14f..3e8fb83e9d 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -86,7 +86,11 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { munlock(p, size); #endif } +#ifdef _WIN32 + _aligned_free(p); +#else free(p); +#endif } bool CPUAllocator::UseGpu() const { return false; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index de4f23515d..8c8dc7026e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -37,7 +37,13 @@ if (WITH_GPU) SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) endif() -register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS}) +SET(OP_PREFETCH_DEPS "") +if (WITH_DISTRIBUTE) + SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch) +endif() + +register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) + # warpctc_op needs cudnn 7 above if (WITH_GPU AND NOT WIN32) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index bb9ea3f3ba..832245371e 100644 --- a/paddle/fluid/operators/activation_op.cc +++ 
b/paddle/fluid/operators/activation_op.cc @@ -149,6 +149,13 @@ $out = \max(x, 0)$ )DOC"; +UNUSED constexpr char GeluDoc[] = R"DOC( +Gelu Activation Operator. + +$out = \\frac{1 + erf(\\frac{x}{\\sqrt{2}})}{2} x$ + +)DOC"; + UNUSED constexpr char TanhDoc[] = R"DOC( Tanh Activation Operator. @@ -472,6 +479,7 @@ REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); +REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc); REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc); REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc); REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc); @@ -489,6 +497,7 @@ REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); REGISTER_ACTIVATION_OP_GRAD_MAKER(Sigmoid, sigmoid); REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu, relu); +REGISTER_ACTIVATION_OP_GRAD_MAKER(Gelu, gelu); REGISTER_ACTIVATION_OP_GRAD_MAKER(Exp, exp); REGISTER_ACTIVATION_OP_GRAD_MAKER(Tanh, tanh); REGISTER_ACTIVATION_OP_GRAD_MAKER(Ceil, ceil); @@ -525,6 +534,7 @@ namespace ops = paddle::operators; __macro(Round, round); \ __macro(Log, log); \ __macro(Square, square); \ + __macro(Gelu, gelu); \ __macro(BRelu, brelu); \ __macro(Pow, pow); \ __macro(STanh, stanh); \ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 4ffc7f364b..a0f8c5c14c 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -16,6 +16,11 @@ limitations under the License. */ #include #include +#include +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" @@ -95,7 +100,7 @@ class ActivationGradKernel auto x = framework::EigenVector::Flatten(*X); functor(*place, x, out, dout, dx); } else { - VLOG(100) << " Inplace activation "; + VLOG(10) << " Inplace activation "; auto x = framework::EigenVector::Flatten(*dX); functor(*place, x, out, dout, dx); } @@ -212,6 +217,31 @@ struct ReluGradFunctor : public BaseActivationFunctor { } }; +// gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) +template +struct GeluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = + ((x * static_cast(M_SQRT1_2)).erf()).template cast().eval(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); + } +}; + +template +struct GeluGradFunctor : BaseActivationFunctor { + bool Inplace() const { return IsInplace("gelu"); } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp = (static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * + ((-static_cast(0.5) * x.square()).exp())) + .template cast() + .eval(); + dx.device(d) = dout * (out / x + temp); + } +}; + // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template struct TanhFunctor : public BaseActivationFunctor { @@ -877,6 +907,7 @@ struct SwishGradFunctor : public BaseActivationFunctor { __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(exp, ExpFunctor, ExpGradFunctor); \ __macro(relu, ReluFunctor, ReluGradFunctor); \ + __macro(gelu, GeluFunctor, GeluGradFunctor); \ __macro(tanh, TanhFunctor, TanhGradFunctor); \ __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h index eddf34494b..4309f0a549 
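Editor's note (not part of the diff): the GeluFunctor / GeluGradFunctor added above implement gelu(x) = 0.5·x·(1 + erf(x/√2)) and its derivative Φ(x) + x·φ(x), where Φ and φ are the standard normal CDF and PDF. A minimal standalone scalar sketch of the same math, assuming only the C++ standard math library:

// Sketch only; mirrors the formulas used by GeluFunctor / GeluGradFunctor.
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES  // for M_SQRT1_2 / M_2_SQRTPI on MSVC, as in the diff
#endif
#include <cmath>
#include <cstdio>

double Gelu(double x) {
  // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
  return 0.5 * x * (1.0 + std::erf(x * M_SQRT1_2));
}

double GeluGrad(double x) {
  // gelu'(x) = Phi(x) + x * phi(x); GeluGradFunctor produces the same value
  // in the form out / x + x * phi(x).
  const double phi = 0.5 * M_2_SQRTPI * M_SQRT1_2 * std::exp(-0.5 * x * x);
  return 0.5 * (1.0 + std::erf(x * M_SQRT1_2)) + x * phi;
}

int main() {
  std::printf("gelu(1.0) = %.6f, gelu'(1.0) = %.6f\n", Gelu(1.0), GeluGrad(1.0));
  return 0;
}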
100644 --- a/paddle/fluid/operators/array_operator.h +++ b/paddle/fluid/operators/array_operator.h @@ -49,7 +49,7 @@ class ArrayOp : public framework::OperatorBase { } else { offset = static_cast(*i_tensor.data()); } - VLOG(100) << " Offset = " << offset; + VLOG(10) << " Offset = " << offset; return offset; } }; diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 3c40135eca..6257e04b01 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -148,8 +148,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { size_t start_offset = lod_and_offset.second.first; size_t end_offset = lod_and_offset.second.second; - VLOG(100) << "idx=" << idx << " x_idx=" << x_idx << " [" - << ", " << end_offset << "]"; + VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " [" + << ", " << end_offset << "]"; // Copy data PADDLE_ENFORCE_GE(end_offset, start_offset); size_t len = end_offset - start_offset; diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index de641cb08e..bddca232e6 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "mkldnn.hpp" #include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { @@ -146,7 +146,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); + const bool use_global_stats = ctx.Attr("use_global_stats"); const bool fuse_with_relu = ctx.Attr("fuse_with_relu"); + bool global_stats = is_test || use_global_stats; const auto *x = ctx.Input("X"); const auto *mean = ctx.Input("Mean"); @@ -177,13 +179,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { T *batch_mean_data = nullptr; T *batch_variance_data = nullptr; - if (!is_test) { + if (!global_stats) { batch_mean_data = batch_mean->mutable_data(ctx.GetPlace()); batch_variance_data = batch_variance->mutable_data(ctx.GetPlace()); } - auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring - : mkldnn::prop_kind::forward_training; + auto propagation = global_stats == true + ? 
mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; auto src_tz = paddle::framework::vectorize2int(x->dims()); auto scale_tz = paddle::framework::vectorize2int(scale->dims()); @@ -199,7 +202,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { shift->data() + ic, &scaleshift_data); unsigned flags = mkldnn::use_scale_shift; - if (is_test) flags |= mkldnn::use_global_stats; + if (global_stats) flags |= mkldnn::use_global_stats; if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor @@ -208,7 +211,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { // keys for backward pass const std::string key = BatchNormMKLDNNHandler::GetHash( - src_tz, epsilon, flags, is_test, input_format, + src_tz, epsilon, flags, global_stats, input_format, ctx.op().Output("SavedMean")); const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; @@ -239,7 +242,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data); std::shared_ptr batch_norm_p; - if (is_test) { + if (global_stats) { // create mkldnn memory for stats (as input) std::shared_ptr mean_memory = handler.AcquireMeanMemoryFromPrimitive(to_void_cast(mean_data)); @@ -269,7 +272,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*batch_norm_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - if (!is_test) { + if (!global_stats) { // mkldnn only compute stats for current batch // so we need compute momentum stats via Eigen lib EigenVectorArrayMap batch_mean_e(batch_mean_data, ic); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 2463c939bc..f66813989c 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -159,6 +159,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("fuse_with_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("use_global_stats", + "(bool, default false) Whether to use global mean and " + "variance. In inference or test mode, setting use_global_stats " + "to true or is_test to true is equivalent. " + "In train mode, when setting use_global_stats True, the " + "global mean and variance are also used during train time, " + "and the BN acts as scaling and shifting.") + .SetDefault(false); AddComment(R"DOC( Batch Normalization.
@@ -190,6 +198,10 @@ class BatchNormKernel const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + + bool global_stats = is_test || use_global_stats; + const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); @@ -217,7 +229,7 @@ class BatchNormKernel saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); - if (!is_test) { + if (!global_stats) { // saved_xx is use just in this batch of data EigenVectorArrayMap saved_mean_e( saved_mean->mutable_data(ctx.GetPlace()), C); @@ -234,7 +246,7 @@ class BatchNormKernel if ((N * sample_size) == 1) { LOG(WARNING) << "Only 1 element in normalization dimension, " << "we skip the batch norm calculation, let y = x."; - framework::TensorCopySync(*x, ctx.GetPlace(), y); + framework::TensorCopy(*x, ctx.GetPlace(), y); return; } @@ -277,7 +289,7 @@ class BatchNormKernel // use SavedMean and SavedVariance to do normalize Eigen::Array inv_std(C); - if (is_test) { + if (global_stats) { ConstEigenVectorArrayMap var_arr( ctx.Input("Variance")->data(), C); inv_std = (var_arr + epsilon).sqrt().inverse(); @@ -289,8 +301,8 @@ class BatchNormKernel inv_std = saved_inv_std; } ConstEigenVectorArrayMap mean_arr( - is_test ? ctx.Input("Mean")->data() - : ctx.Output("SavedMean")->data(), + global_stats ? ctx.Input("Mean")->data() + : ctx.Output("SavedMean")->data(), C); // ((x - est_mean) * (inv_var) * scale + bias @@ -336,15 +348,27 @@ class BatchNormGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { // check input PADDLE_ENFORCE(ctx->HasInput("X")); - PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), ""); - PADDLE_ENFORCE(ctx->HasInput("SavedMean"), ""); - PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), ""); + PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SavedMean"), + "Input(SavedMean) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), + "Input(SavedVariance) should not be null"); // check output PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Scale")), ""); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), ""); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), + "Output(Scale@GRAD) and Output(Bias@GRAD) should not be " + "null at same time"); + } + const bool use_global_stats = ctx->Attrs().Get("use_global_stats"); + if (use_global_stats) { + PADDLE_ENFORCE(!ctx->Attrs().Get("use_mkldnn"), + "Using global stats during training is not supported " + "in gradient op kernel of batch_norm_mkldnn_op now."); + } const auto x_dims = ctx->GetInputDim("X"); const DataLayout data_layout = framework::StringToDataLayout( @@ -354,8 +378,10 @@ class BatchNormGradOp : public framework::OperatorWithKernel { : x_dims[x_dims.size() - 1]); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); - ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + 
ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); + } } protected: @@ -405,6 +431,8 @@ class BatchNormGradKernel // SavedVariance have been reverted in forward operator const auto *saved_inv_variance = ctx.Input("SavedVariance"); const std::string data_layout_str = ctx.Attr("data_layout"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + const float epsilon = ctx.Attr("epsilon"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); @@ -419,38 +447,60 @@ class BatchNormGradKernel : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap mean_arr(saved_mean->data(), C); - ConstEigenVectorArrayMap inv_var_arr(saved_inv_variance->data(), C); - // init output auto *d_x = ctx.Output(framework::GradVarName("X")); auto *d_scale = ctx.Output(framework::GradVarName("Scale")); auto *d_bias = ctx.Output(framework::GradVarName("Bias")); d_x->mutable_data(ctx.GetPlace()); - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); + + const T *mean_data = saved_mean->data(); + const T *inv_var_data = saved_inv_variance->data(); + Tensor inv_var_tensor; + if (use_global_stats) { + const auto *running_mean = ctx.Input("Mean"); + const auto *running_variance = ctx.Input("Variance"); + mean_data = running_mean->data(); + T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); + EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); + ConstEigenVectorArrayMap var_arr(running_variance->data(), C); + + inv_var_tmp = (var_arr + epsilon).sqrt().inverse().eval(); + inv_var_data = running_inv_var_data; + } + + ConstEigenVectorArrayMap scale_arr(scale->data(), C); + ConstEigenVectorArrayMap mean_arr(mean_data, C); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); + + T *d_bias_data = nullptr; + T *d_scale_data = nullptr; + if (d_scale && d_bias) { + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + d_bias_data = d_bias->mutable_data(ctx.GetPlace()); + d_scale_data = d_scale->mutable_data(ctx.GetPlace()); + } // d_bias = np.sum(d_y, axis=0) // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) + EigenVectorArrayMap d_bias_arr(d_bias_data, C); + EigenVectorArrayMap d_scale_arr(d_scale_data, C); - EigenVectorArrayMap d_bias_arr(d_bias->mutable_data(ctx.GetPlace()), - C); - EigenVectorArrayMap d_scale_arr(d_scale->mutable_data(ctx.GetPlace()), - C); - - d_bias_arr.setZero(); - d_scale_arr.setZero(); + if (d_scale && d_bias) { + d_bias_arr.setZero(); + d_scale_arr.setZero(); + } - if ((N * sample_size) == 1) { - framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x); + if ((N * sample_size) == 1 && !use_global_stats) { + framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); return; } - const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size); + int scale_coefff = use_global_stats ? 
1 : N * sample_size; + const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; switch (data_layout) { case DataLayout::kNCHW: { @@ -460,19 +510,29 @@ class BatchNormGradKernel sample_size, N * C); d_x_arr.setZero(); - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_bias_arr(c) += d_y_arr.col(nc).sum(); - d_scale_arr(c) += - ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) - .sum(); + if (d_scale && d_bias) { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_bias_arr(c) += d_y_arr.col(nc).sum(); + d_scale_arr(c) += ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * + d_y_arr.col(nc)) + .sum(); + } } - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) += - scale_inv_var_nhw(c) * - (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) - - (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * inv_var_arr(c)); + if (!use_global_stats) { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) += + scale_inv_var_nhw(c) * + (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) - + (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * + inv_var_arr(c)); + } + } else { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) += scale_inv_var_nhw(c) * d_y_arr.col(nc); + } } break; } @@ -488,15 +548,27 @@ class BatchNormGradKernel const auto d_y_mul_x_minus_mean_row_sum = (d_y_arr * x_minus_mean).rowwise().sum(); const auto inv_var_sqr = inv_var_arr * inv_var_arr; - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_bias_arr += d_y_arr.col(nhw); - d_scale_arr += - (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); - d_x_arr.col(nhw) += - scale_inv_var_nhw * - (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum - - x_minus_mean.col(nhw) * inv_var_sqr * - d_y_mul_x_minus_mean_row_sum); + + if (d_scale && d_bias) { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_bias_arr += d_y_arr.col(nhw); + d_scale_arr += + (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); + } + } + + if (!use_global_stats) { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_x_arr.col(nhw) += + scale_inv_var_nhw * + (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum - + x_minus_mean.col(nhw) * inv_var_sqr * + d_y_mul_x_minus_mean_row_sum); + } + } else { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_x_arr.col(nhw) += scale_inv_var_nhw * d_y_arr.col(nhw); + } } break; } @@ -522,6 +594,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("SavedMean", Output("SavedMean")); op->SetInput("SavedVariance", Output("SavedVariance")); + // used when setting use_global_stats True during training + op->SetInput("Mean", Output("MeanOut")); + op->SetInput("Variance", Output("VarianceOut")); + op->SetAttrMap(Attrs()); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu similarity index 57% rename from paddle/fluid/operators/batch_norm_op.cu.cc rename to paddle/fluid/operators/batch_norm_op.cu index 0609027c69..1c45746a92 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
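Editor's note (not part of the diff): with use_global_stats the running statistics μ and σ² are treated as constants, so the backward pass added above for the CPU kernel, and mirrored by the new CUDA kernels below, reduces to

$$\hat{x} = \frac{x - \mu}{\sqrt{\sigma^{2} + \epsilon}}, \qquad y = \gamma \hat{x} + \beta$$
$$\frac{\partial L}{\partial x} = \frac{\gamma}{\sqrt{\sigma^{2} + \epsilon}} \frac{\partial L}{\partial y}, \qquad \frac{\partial L}{\partial \gamma} = \sum \hat{x}\,\frac{\partial L}{\partial y}, \qquad \frac{\partial L}{\partial \beta} = \sum \frac{\partial L}{\partial y}$$

with no gradient flowing through the statistics themselves, unlike the regular training path.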
*/ -#include "paddle/fluid/operators/batch_norm_op.h" +#include #include +#include +#include +#include "cub/cub.cuh" #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" @@ -59,6 +63,7 @@ class BatchNormKernel double epsilon = static_cast(ctx.Attr("epsilon")); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); + const bool use_global_stats = ctx.Attr("use_global_stats"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); @@ -96,7 +101,7 @@ class BatchNormKernel mode_ = CUDNN_BATCHNORM_SPATIAL; #endif - VLOG(30) << "Setting descriptors."; + VLOG(3) << "Setting descriptors."; std::vector dims; std::vector strides; if (data_layout == DataLayout::kNCHW) { @@ -121,7 +126,7 @@ class BatchNormKernel auto handle = dev_ctx.cudnn_handle(); // Now, depending on whether we are running test or not, we have two paths. - if (is_test) { + if (is_test || use_global_stats) { // only when test we use input to do computation. const auto *est_mean = ctx.Input("Mean"); const auto *est_var = ctx.Input("Variance"); @@ -163,7 +168,7 @@ class BatchNormKernel if ((N * H * W * D) == 1) { LOG(WARNING) << "Only 1 element in normalization dimension, " << "we skip the batch norm calculation, let y = x."; - framework::TensorCopySync(*x, ctx.GetPlace(), y); + framework::TensorCopy(*x, ctx.GetPlace(), y); } else { double this_factor = 1. - momentum; @@ -191,6 +196,58 @@ class BatchNormKernel } }; +template +static __global__ void KeBNBackwardData(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *variance, + const double epsilon, const int C, + const int HxW, const int num, T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); + dx[i] = static_cast(static_cast>(dy[i]) * + scale[c] * inv_var); + } +} + +template +static __global__ void KeBNBackwardScaleBias( + const T *dy, const T *x, const BatchNormParamType *mean, + const BatchNormParamType *variance, const double epsilon, const int N, + const int C, const int HxW, BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + BatchNormParamType inv_var_i = 1.0 / sqrt(variance[i] + epsilon); + BatchNormParamType mean_i = mean[i]; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + ds_sum += static_cast>(dy[index]) * + (static_cast>(x[index]) - mean_i); + db_sum += static_cast>(dy[index]); + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale[i] = ds_sum * inv_var_i; + dbias[i] = db_sum; + } + __syncthreads(); + } +} + template class BatchNormGradKernel : public framework::OpKernel { @@ -200,6 +257,8 @@ class BatchNormGradKernel "It must use CUDAPlace."); double epsilon = static_cast(ctx.Attr("epsilon")); const std::string data_layout_str = ctx.Attr("data_layout"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const auto *x = ctx.Input("X"); @@ -219,42 +278,13 @@ class BatchNormGradKernel auto *d_bias = ctx.Output(framework::GradVarName("Bias")); d_x->mutable_data(ctx.GetPlace()); - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - if ((N * H * W * D) == 1) { - framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x); - math::SetConstant> - functor; - functor(dev_ctx, d_scale, static_cast>(0)); - functor(dev_ctx, d_bias, static_cast>(0)); - return; + if (d_scale && d_bias) { + d_scale->mutable_data>(ctx.GetPlace()); + d_bias->mutable_data>(ctx.GetPlace()); } - PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); PADDLE_ENFORCE_EQ(scale->dims()[0], C); - // ------------------- cudnn descriptors --------------------- - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#if CUDNN_VERSION_MIN(7, 0, 0) - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; -#else - mode_ = CUDNN_BATCHNORM_SPATIAL; -#endif - std::vector dims; std::vector strides; if (data_layout == DataLayout::kNCHW) { @@ -264,34 +294,114 @@ class BatchNormGradKernel dims = {N, C, H, W, D}; strides = {H * W * C * D, 1, W * D * C, D * C, C}; } - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, mode_)); - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const void *saved_mean_data = - saved_mean->template data>(); - const void *saved_var_data = - saved_var->template data>(); - - CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x->template data(), - data_desc_, d_y->template data(), data_desc_, - d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, - scale->template data>(), - d_scale->template mutable_data>(ctx.GetPlace()), - d_bias->template mutable_data>(ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); - // clean when exit. 
- CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + auto &dev_ctx = ctx.template device_context(); + if (!use_global_stats) { + if ((N * H * W * D) == 1) { + framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); + math::SetConstant> + functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#if CUDNN_VERSION_MIN(7, 0, 0) + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; +#else + mode_ = CUDNN_BATCHNORM_SPATIAL; +#endif + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); + const void *saved_mean_data = + saved_mean->template data>(); + const void *saved_var_data = + saved_var->template data>(); + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + CudnnDataType::kZero(), CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, d_y->template data(), data_desc_, + d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data>(), + d_scale->template mutable_data>(ctx.GetPlace()), + d_bias->template mutable_data>(ctx.GetPlace()), + epsilon, saved_mean_data, saved_var_data)); + + // clean when exit. 
+ CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } else { + const auto *running_mean = ctx.Input("Mean"); + const auto *running_var = ctx.Input("Variance"); + + const auto *running_mean_data = + running_mean->template data>(); + const auto *running_var_data = + running_var->template data>(); + + const int num = x->numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid1 = (num + block - 1) / block; + int grid2 = std::min(C, max_blocks); + + if (data_layout == framework::DataLayout::kNCHW) { + if (d_x) { + KeBNBackwardData<<< + grid1, block, 0, dev_ctx.stream()>>>( + d_y->data(), scale->data>(), + running_var_data, epsilon, C, H * W, num, d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias<<< + grid2, block, 0, dev_ctx.stream()>>>( + d_y->data(), x->data(), running_mean_data, running_var_data, + epsilon, C, H * W, num, d_scale->data>(), + d_bias->data>()); + } + } else { + if (d_x) { + KeBNBackwardData<<< + grid1, block, 0, dev_ctx.stream()>>>( + d_y->data(), scale->data>(), + running_var_data, epsilon, C, H * W, num, d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias<<< + grid2, block, 0, dev_ctx.stream()>>>( + d_y->data(), x->data(), running_mean_data, running_var_data, + epsilon, C, H * W, num, d_scale->data>(), + d_bias->data>()); + } + } + } } }; diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 791f8a4d3b..62771d09f1 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -33,11 +33,11 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); auto selected_items = ToMap(items, high_level.back()); - VLOG(30) << "selected_items:"; + VLOG(3) << "selected_items:"; for (size_t i = 0; i < selected_items.size(); ++i) { - VLOG(30) << "offset:" << i; + VLOG(3) << "offset:" << i; for (auto &item : selected_items[i]) { - VLOG(30) << ItemToString(item); + VLOG(3) << ItemToString(item); } } @@ -138,11 +138,11 @@ std::vector> BeamSearch::SelectTopBeamSizeItems( } result.emplace_back(items); } - VLOG(30) << "SelectTopBeamSizeItems result size " << result.size(); + VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); for (auto &items : result) { - VLOG(30) << "item set:"; + VLOG(3) << "item set:"; for (auto &item : items) { - VLOG(30) << ItemToString(item); + VLOG(3) << ItemToString(item); } } diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index 501807e7f3..40b46781da 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -46,7 +46,7 @@ void CreateInput(LoDTensor* ids, LoDTensor* scores) { auto* scores_data = scores->mutable_data(place); vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); vector _scores( - {0.5, 0.3, 0.2, 0.6, 0.3, 0.1, 0.9, 0.5, 0.1, 0.7, 0.5, 0.1}); + {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); for (int i = 0; i < 12; i++) { ids_data[i] = _ids[i]; @@ -80,7 +80,7 @@ TEST(DISABLED_beam_search_op, run) { ASSERT_EQ(sids.lod(), sscores.lod()); vector tids({4, 2, 3, 8}); - vector tscores({0.5, 0.6, 0.9, 0.7}); + vector tscores({0.5f, 0.6f, 0.9f, 0.7f}); for (int i = 0; i < 4; i++) { ASSERT_EQ(tids[i], 
sids.data()[i]); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h index f23336f7b9..5017c3a457 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.h +++ b/paddle/fluid/operators/bilinear_tensor_product_op.h @@ -70,7 +70,7 @@ class BilinearTensorProductKernel : public framework::OpKernel { if (bias) { auto bias_vec = EigenMatrix::From(*bias); Eigen::DSizes bcast(batch_size, 1); - output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat; + output_mat.device(place) = bias_vec.broadcast(bcast).eval() + output_mat; } } }; @@ -99,13 +99,13 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { auto d_out_mat = EigenMatrix::From(*d_out); auto& place = *ctx.template device_context().eigen_device(); auto& dev_ctx = ctx.template device_context(); - // Create the intermediate variable to caculate the Output(Y@Grad). + // Create the intermediate variable to calculate the Output(Y@Grad). Tensor x_scale; x_scale.mutable_data(framework::make_ddim({batch_size, x_dim}), ctx.GetPlace()); auto x_scale_mat = EigenMatrix::From(x_scale); - // Create the intermediate variable to caculate the Output(X@Grad). + // Create the intermediate variable to calculate the Output(X@Grad). Tensor y_scale; y_scale.mutable_data(framework::make_ddim({batch_size, y_dim}), ctx.GetPlace()); @@ -113,65 +113,64 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { math::SetConstant set_zero; - // Set Output(X@Grad) be zero. if (d_x) { d_x->mutable_data(ctx.GetPlace()); set_zero(dev_ctx, d_x, static_cast(0)); } - // Set Output(Y@Grad) be zero. if (d_y) { d_y->mutable_data(ctx.GetPlace()); set_zero(dev_ctx, d_y, static_cast(0)); } + if (d_weight) { + d_weight->mutable_data(ctx.GetPlace()); + } + auto blas = math::GetBlas(ctx); // Caculate the Output(X@Grad) and Output(Y@Grad). - if (d_x || d_y) { + if (d_x || d_y || d_weight) { Eigen::DSizes bcast_for_x(1, y_dim); Eigen::DSizes bcast_for_y(1, x_dim); + Eigen::DSizes bcast_for_weight(1, x_dim); + for (int i = 0; i < out_dim; ++i) { Tensor weight_i = weight->Slice(i, i + 1).Resize( framework::make_ddim({x_dim, y_dim})); auto output_vec = d_out_mat.chip(i, 1); + if (d_x) { y_scale_mat.device(place) = output_vec.reshape(Eigen::DSizes(batch_size, 1)) - .broadcast(bcast_for_x) * + .broadcast(bcast_for_x) + .eval() * y_mat; blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, y_scale.data(), weight_i.data(), 1, d_x->data()); } - if (d_y) { - x_scale_mat.device(place) = + + if (d_y || d_weight) { + auto output_vec_y = output_vec.reshape(Eigen::DSizes(batch_size, 1)) - .broadcast(bcast_for_y) * - x_mat; - blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, - x_scale.data(), weight_i.data(), 1, d_y->data()); + .broadcast(bcast_for_y) + .eval(); + x_scale_mat.device(place) = output_vec_y * x_mat; + if (d_y) { + blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, + x_scale.data(), weight_i.data(), 1, d_y->data()); + } + if (d_weight) { + Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize( + framework::make_ddim({x_dim, y_dim})); + blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1, + x_scale.data(), y->data(), 0, d_weight_i.data()); + } } } } - // Caculate the gradient of Input(Weight). 
- if (d_weight) { - d_weight->mutable_data(ctx.GetPlace()); - Eigen::DSizes bcast_for_weight(1, x_dim); - for (int i = 0; i < out_dim; ++i) { - Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize( - framework::make_ddim({x_dim, y_dim})); - auto output_vec = d_out_mat.chip(i, 1); - x_scale_mat.device(place) = - output_vec.reshape(Eigen::DSizes(batch_size, 1)) - .broadcast(bcast_for_weight) * - x_mat; - blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1, - x_scale.data(), y->data(), 0, d_weight_i.data()); - } - } - - // Caculate the gradient of Input(Bias). + // calculate the gradient of Input(Bias). if (d_bias) { d_bias->mutable_data(ctx.GetPlace()); auto d_bias_mat = framework::EigenVector::Flatten(*d_bias); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 093b0a9a1f..57817da71a 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -37,7 +37,7 @@ class ConcatOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0."); if (n == 1) { - VLOG(30) << "Warning: concat op have only one input, may waste memory"; + VLOG(3) << "Warning: concat op have only one input, may waste memory"; } auto out_dims = ins[0]; diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 5da0a536d9..dc7ef66495 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -47,8 +47,8 @@ class FeedOp : public framework::OperatorBase { auto col = Attr("col"); - VLOG(30) << "Feed Var " << feed_var_name << "'s " << col - << " column to var " << out_name; + VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var " + << out_name; auto &feed_list = feed_var->Get(); auto &feed_item = feed_list.at(static_cast(col)); diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index c9e759ebff..c197b45e81 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -57,7 +57,7 @@ class FetchOp : public framework::OperatorBase { TensorCopySync(src_item, platform::CPUPlace(), &dst_item); dst_item.set_lod(src_item.lod()); - VLOG(30) << "Fetch variable " << fetch_var_name << " to " << out_name; + VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name; } }; diff --git a/paddle/fluid/operators/controlflow/parallel_do_op.cc b/paddle/fluid/operators/controlflow/parallel_do_op.cc index c795d4bdd1..ab25628d45 100644 --- a/paddle/fluid/operators/controlflow/parallel_do_op.cc +++ b/paddle/fluid/operators/controlflow/parallel_do_op.cc @@ -48,7 +48,7 @@ static void SplitTensorAndMoveTensorToScopes( auto lod_tensors = tensor.SplitLoDTensor(places); for (auto &lod : lod_tensors) { - VLOG(30) << lod.dims(); + VLOG(3) << lod.dims(); } if (num_sub_scopes == 0) { num_sub_scopes = lod_tensors.size(); @@ -263,7 +263,7 @@ class ParallelDoGradOp : public framework::OperatorBase { if (s == framework::kEmptyVarName) { continue; } - VLOG(30) << "Moving " << s; + VLOG(3) << "Moving " << s; CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); } WaitOnPlaces(places); @@ -277,7 +277,7 @@ class ParallelDoGradOp : public framework::OperatorBase { if (s == framework::kEmptyVarName) { continue; } - VLOG(30) << "Accumulating " << s; + VLOG(3) << "Accumulating " << s; if (s == framework::kEmptyVarName) continue; std::string tmp_name; auto *tmp = sub_scopes[0]->Var(&tmp_name); 
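Editor's note on the bilinear_tensor_product_op.h rewrite above (not part of the diff): the restructuring computes x_scale once and reuses it for both the Y and Weight gradients. For batch index k and output index i the quantities involved are

$$out_{k,i} = x_k^{\top} W_i y_k + b_i$$
$$\frac{\partial L}{\partial x_k} = \sum_i \frac{\partial L}{\partial out_{k,i}} W_i y_k, \qquad \frac{\partial L}{\partial y_k} = \sum_i \frac{\partial L}{\partial out_{k,i}} W_i^{\top} x_k$$
$$\frac{\partial L}{\partial W_i} = \sum_k \frac{\partial L}{\partial out_{k,i}}\, x_k y_k^{\top}, \qquad \frac{\partial L}{\partial b_i} = \sum_k \frac{\partial L}{\partial out_{k,i}}$$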
@@ -289,7 +289,7 @@ class ParallelDoGradOp : public framework::OperatorBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, framework::AttributeMap{{"use_mkldnn", {false}}}); - VLOG(100) << sum_op->DebugStringEx(sub_scopes[0]); + VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]); sum_op->Run(*sub_scopes[0], places[0]); WaitOnPlace(places[0]); } @@ -316,7 +316,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { auto *grad = new framework::OpDesc(); grad->SetType("parallel_do_grad"); for (auto &input_param : this->InputNames()) { - VLOG(30) << input_param; + VLOG(3) << input_param; grad->SetInput(input_param, this->Input(input_param)); if (input_param != kPlaces) { grad->SetOutput(framework::GradVarName(input_param), diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc index 484160aeb8..fa18ade323 100644 --- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc @@ -34,8 +34,8 @@ class WriteToArrayOp : public ArrayOp { auto *out = scope.FindVar(Output("Out"))->GetMutable(); if (offset >= out->size()) { - VLOG(100) << "Resize " << Output("Out") << " from " << out->size() - << " to " << offset + 1; + VLOG(10) << "Resize " << Output("Out") << " from " << out->size() + << " to " << offset + 1; out->resize(offset + 1); } auto *out_tensor = &out->at(offset); @@ -47,9 +47,9 @@ class WriteToArrayOp : public ArrayOp { TensorCopy(x_tensor, place, dev_ctx, out_tensor); } else { - VLOG(100) << "WARNING: The input tensor 'x_tensor' holds no memory, so " - "nothing has been written to output array[" - << offset << "]."; + VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << offset << "]."; } } }; @@ -104,7 +104,7 @@ class WriteToArrayInferVarType : public framework::VarTypeInference { framework::BlockDesc *block) const override { auto x_name = op_desc.Input("X")[0]; auto out_name = op_desc.Output("Out")[0]; - VLOG(100) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; + VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; auto &out = block->FindRecursiveOrCreateVar(out_name); out.SetType(framework::proto::VarType::LOD_TENSOR_ARRAY); auto *x = block->FindVarRecursive(x_name); @@ -139,7 +139,7 @@ class ReadFromArrayOp : public ArrayOp { framework::TensorCopy(x_array[offset], place, dev_ctx, out_tensor); out_tensor->set_lod(x_array[offset].lod()); } else { - VLOG(100) << "offset " << offset << " >= " << x_array.size(); + VLOG(10) << "offset " << offset << " >= " << x_array.size(); } } }; @@ -167,6 +167,19 @@ $$T = A[i]$$ }; class ReadFromArrayInferShape : public WriteToArrayInferShape { + public: + void operator()(framework::InferShapeContext *context) const override { + WriteToArrayInferShape::operator()(context); + if (!context->HasInput("X")) { + return; + } + + // FIXME: just for compile time. 
+ if (!context->IsRuntime()) { + context->ShareLoD("X", /*->*/ "Out"); + } + } + protected: const char *NotHasXError() const override { return "The input array X must be set"; diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 2b56514fe0..6c1b2f329a 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -132,15 +132,15 @@ class WhileGradOp : public framework::OperatorBase { for (auto cur_scope_iter = step_scopes->rbegin(); cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) { - VLOG(30) << "Start backward at time_step " - << cur_scope_iter - step_scopes->rbegin(); + VLOG(3) << "Start backward at time_step " + << cur_scope_iter - step_scopes->rbegin(); framework::Scope &cur_scope = **cur_scope_iter; // Link OG from outside to inside for (size_t i = 0; i < outside_og_names.size(); ++i) { auto outside_og_name = outside_og_names[i]; auto inside_og_name = inside_og_names[i]; - VLOG(80) << "Linking outside " << outside_og_name << " --> inside " - << inside_og_name; + VLOG(8) << "Linking outside " << outside_og_name << " --> inside " + << inside_og_name; if (scope.FindVar(outside_og_name) == nullptr) { continue; } @@ -162,11 +162,11 @@ class WhileGradOp : public framework::OperatorBase { auto &outside_array = og_outside.Get(); auto &inside_array = detail::Ref(og_inside.GetMutable()); - VLOG(80) << outside_og_name << " size = " << outside_array.size(); + VLOG(8) << outside_og_name << " size = " << outside_array.size(); inside_array.resize(outside_array.size()); for (size_t j = 0; j < inside_array.size(); ++j) { - VLOG(80) << j << " " << outside_array[j].numel(); + VLOG(8) << j << " " << outside_array[j].numel(); if (outside_array[j].numel() != 0) { inside_array[j].set_lod(outside_array[j].lod()); inside_array[j].ShareDataWith(outside_array[j]); @@ -292,7 +292,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { auto igs = InputGrad(kX, /*do not drop empty gradient*/ false); for (auto &each_ig : igs) { if (inner_op_outputs.find(each_ig) == inner_op_outputs.end()) { - VLOG(80) << "Ignore " << each_ig; + VLOG(8) << "Ignore " << each_ig; each_ig = framework::kEmptyVarName; } } @@ -356,8 +356,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference { auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); auto *g_var = block->FindVarRecursive(pg_ig_names[i]); if (g_var != nullptr) { // Gradient could be @EMPTY@ - VLOG(50) << "Setting " << pg_ig_names[i] << " following " << p_names[i] - << " type: " << p_var.GetType(); + VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i] + << " type: " << p_var.GetType(); g_var->SetType(p_var.GetType()); g_var->SetDataType(p_var.GetDataType()); } diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 42c2b3a24c..dbb6ffd5e2 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -151,11 +151,11 @@ class CUDNNConvOpKernel : public framework::OpKernel { // Currently tensor core is only enabled using this algo algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; half_float = true; - VLOG(50) << "use cudnn_tensor_op_math"; + VLOG(5) << "use cudnn_tensor_op_math"; } else { CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); - VLOG(50) << "NOT use cudnn_tensor_op_math"; + VLOG(5) << "NOT use cudnn_tensor_op_math"; } #endif 
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 9e2e2cf818..05e268bf6a 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { @@ -28,259 +28,6 @@ using mkldnn::stream; using platform::to_void_cast; using platform::GetMKLDNNFormat; -class ConvMKLDNNHandler : public platform::MKLDNNHandler { - public: - ConvMKLDNNHandler( - std::shared_ptr conv_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) { - conv_pd_ = conv_pd; - } - - ConvMKLDNNHandler( - std::shared_ptr conv_pd, - std::shared_ptr - conv_bwd_data_pd, - std::shared_ptr - conv_bwd_weights_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - conv_pd_(conv_pd), - conv_bwd_weights_pd_(conv_bwd_weights_pd), - conv_bwd_data_pd_(conv_bwd_data_pd) { - // If we are in Grad operatgor then update a key with BWD suffix to - // distinguish from FWD memory primitives - key_ += "-BWD"; - } - - size_t GetDstMemorySize() const { - return conv_pd_->dst_primitive_desc().get_size(); - } - - mkldnn::memory::format GetDstFormat() const { - return static_cast( - conv_pd_->dst_primitive_desc().desc().data.format); - } - - size_t GetDiffWeightsMemorySize() const { - return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size(); - } - - size_t GetDiffSourceMemorySize() const { - return conv_bwd_data_pd_->diff_src_primitive_desc().get_size(); - } - - std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto src_pd = conv_bwd_weights_pd_->src_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); - return this->AcquireMemory(src_pd, user_pd, user_memory_p, - "@weights-src_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffDstMemoryFromWeightsPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@weights-diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( - void* ptr) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr, - "@diff_weights_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@data-diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( - const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { // NOLINT - auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc(); - auto user_pd = user_weights_memory_p->get_primitive_desc(); - return this->AcquireMemory(weights_pd, user_pd, 
user_weights_memory_p, - "@data-weights_mem_p", pipeline); - } - - std::shared_ptr AcquireResidualDataMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p"); - } - - std::shared_ptr AcquireDstMemoryFromResidualDataMemory( - const std::shared_ptr& user_residual_memory_p, - void* dst_ptr, - std::vector& pipeline) { // NOLINT - return this->AcquireMemory(user_residual_memory_p, - this->AcquireDstMemoryFromPrimitive(dst_ptr), - "@residual_data_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffSrcMemoryFromDataPrimitive( - void* ptr) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p"); - } - - std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr, - "@dst_mem_p"); - } - - std::shared_ptr AcquireSrcMemoryFromPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto src_pd = conv_pd_->src_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); - return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", - pipeline); - } - - std::shared_ptr AcquireWeightsMemoryFromPrimitive( - const std::shared_ptr user_weights_memory_p, - std::vector& pipeline, // NOLINT - bool is_persistent = false) { - auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); - auto weights_pd = conv_pd_->weights_primitive_desc(); - return this->AcquireMemory(weights_pd, user_weights_pd, - user_weights_memory_p, "@weights_mem_p", - pipeline, is_persistent); - } - - std::shared_ptr AcquireBiasMemoryFromPrimitive( - const std::shared_ptr user_bias_memory_p, - std::vector& pipeline) { // NOLINT - auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); - auto bias_pd = conv_pd_->bias_primitive_desc(); - return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, - "@bias_mem_p", pipeline); - } - - std::shared_ptr AcquireConvolution( - std::shared_ptr src_memory_p, - std::shared_ptr weights_memory_p, - std::shared_ptr dst_memory_p) { - auto prim_key = key_ + "@conv_p"; - auto conv_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), - "Fail to find convolution primitive in device context"); - if (conv_p == nullptr) { - conv_p = std::make_shared( - *conv_pd_, *(src_memory_p), *(weights_memory_p.get()), - *(dst_memory_p.get())); - - dev_ctx_.SetBlob(prim_key, conv_p); - } else { - is_reusing_ = true; - } - return conv_p; - } - - std::shared_ptr AcquireConvolution( - std::shared_ptr src_memory_p, - std::shared_ptr weights_memory_p, - std::shared_ptr bias_memory_p, - std::shared_ptr dst_memory_p) { - auto prim_key = key_ + "@conv_p"; - auto conv_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), - "Fail to find convolution primitive in device context"); - if (conv_p == nullptr) { - conv_p = std::make_shared( - *conv_pd_, *(src_memory_p), *(weights_memory_p.get()), - *(bias_memory_p.get()), *(dst_memory_p.get())); - - dev_ctx_.SetBlob(prim_key, conv_p); - } else { - is_reusing_ = true; - } - return conv_p; - } - - std::shared_ptr - AcquireConvolutionBackwardWeights( - std::shared_ptr src_memory_p, - std::shared_ptr diff_dst_memory_p, - std::shared_ptr diff_weights_memory_p) { - auto prim_key = key_ + "@conv_bwd_weights_p"; - auto conv_bwd_weights_p = - std::static_pointer_cast( - 
dev_ctx_.GetBlob(prim_key)); - PADDLE_ENFORCE( - (conv_bwd_weights_p != nullptr) || (is_reusing_ == false), - "Fail to find convolution bwd weights primitive in device context"); - if (conv_bwd_weights_p == nullptr) { - // create backward conv primitive for weights - conv_bwd_weights_p = - std::make_shared( - *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p, - *diff_weights_memory_p); - dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p); - } else { - is_reusing_ = true; - } - return conv_bwd_weights_p; - } - - std::shared_ptr - AcquireConvolutionBackwardData( - std::shared_ptr diff_dst_memory_p, - std::shared_ptr weights_memory_p, - std::shared_ptr diff_src_memory_p) { - auto prim_key = key_ + "@conv_bwd_data_p"; - auto conv_bwd_data_p = - std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - PADDLE_ENFORCE( - (conv_bwd_data_p != nullptr) || (is_reusing_ == false), - "Fail to find convolution bwd data primitive in device context"); - if (conv_bwd_data_p == nullptr) { - conv_bwd_data_p = std::make_shared( - *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p, - *diff_src_memory_p); - dev_ctx_.SetBlob(prim_key, conv_bwd_data_p); - } else { - is_reusing_ = true; - } - return conv_bwd_data_p; - } - - // Generate keys for storing/retriving primitives for this operator - // TODO(jczaja): Make hashing function more optimial - static std::string GetHash(memory::dims& input_dims, // NOLINT - memory::dims& weights_dims, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT - std::vector& dilations, // NOLINT - int groups, const std::string& suffix) { - return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) + - dims2str(paddings) + dims2str(dilations) + std::to_string(groups) + - suffix; - } - - private: - std::shared_ptr conv_pd_; - std::shared_ptr - conv_bwd_weights_pd_; - std::shared_ptr - conv_bwd_data_pd_; -}; - template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -351,7 +98,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); // Get unique name for storing MKLDNN primitives - const std::string key = ConvMKLDNNHandler::GetHash( + const std::string key = platform::ConvMKLDNNHandler::GetHash( src_tz, weights_tz, strides, paddings, dilations, groups, ctx.op().Output("Output")); const std::string key_conv_pd = key + "@conv_pd"; @@ -400,7 +147,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { // Save conv_pd/src_memory/weights_memory for backward pass if (!is_test) dev_ctx.SetBlob(key_conv_pd, conv_pd); - ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); + platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); // create mkldnn memory from input tensors (data/weights) auto user_src_memory_p = @@ -616,9 +363,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // Get an unique name from "argument" name of "Output" variable // as well as attributes of primitive to be created // This name will be used as key when saving info into device context - const std::string key = - ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings, - dilations, groups, ctx.op().Input("Output")); + const std::string key = platform::ConvMKLDNNHandler::GetHash( + src_tz, weights_tz, strides, paddings, dilations, groups, + ctx.op().Input("Output")); const std::string key_conv_pd = key + "@conv_pd"; std::vector pipeline; @@ -673,8 +420,9 @@ class ConvMKLDNNGradOpKernel : 
public paddle::framework::OpKernel { std::make_shared( conv_bwd_data_desc, mkldnn_engine, *conv_pd); - ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd, conv_bwd_weights_pd, - dev_ctx, mkldnn_engine, key); + platform::ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd, + conv_bwd_weights_pd, dev_ctx, + mkldnn_engine, key); // create mkldnn memory from input tensors (data/weights) auto user_src_memory_p = diff --git a/paddle/fluid/operators/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/conv_transpose_mkldnn_op.cc new file mode 100644 index 0000000000..317d4cebe2 --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_mkldnn_op.cc @@ -0,0 +1,299 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using framework::DataLayout; + +template +class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + const bool is_test = ctx.Attr("is_test"); + PADDLE_ENFORCE( + is_test == true, + "ConvTransposeMKLDNN works only for inference!. Set is_test = True"); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); + + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != mkldnn::memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != mkldnn::memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->dims().size() == 4, + "Input must be with 4 dimensions, i.e. NCHW"); + PADDLE_ENFORCE(filter->dims().size() == 4, + "Filter must be with 4 dimensions, i.e. OIHW"); + + if (bias) { + PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && + bias->format() != mkldnn::memory::format::format_undef, + "Wrong layout/format set for Bias tensor"); + PADDLE_ENFORCE(bias->dims().size() == 1, + "Bias must only have 1 dimension, i.e. 
X"); + } + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + + // TODO(tpatejko): add support for dilation + PADDLE_ENFORCE( + dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + "dilation in convolution is not implemented yet"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector iohw_weights_tz = + paddle::framework::vectorize2int(filter->dims()); + std::vector weights_tz = iohw_weights_tz; + // IOHW -> OIHW + weights_tz[0] = iohw_weights_tz[1]; + weights_tz[1] = iohw_weights_tz[0]; + + // Custom Reorder from IOHW to OIHW + auto iohw2oihw_reorder = + [&iohw_weights_tz](const T* filter_data) -> std::shared_ptr { + int o = iohw_weights_tz[1]; + int c = iohw_weights_tz[0]; + int h = iohw_weights_tz[2]; + int w = iohw_weights_tz[3]; + std::shared_ptr reordered_filter_data(new T[o * c * h * w](), + std::default_delete()); + for (int i = 0; i < c; ++i) { + for (int j = 0; j < o; ++j) { + int in_offset = j * h * w + i * o * h * w; + int out_offset = j * c * h * w + i * h * w; + std::memcpy(&(reordered_filter_data.get())[out_offset], + &filter_data[in_offset], h * w * sizeof(T)); + } + } + + return reordered_filter_data; + }; + + int g = std::max(groups, 1); + if (g > 1) { + int o = weights_tz[0]; + int i = weights_tz[1]; + int h = weights_tz[2]; + int w = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = h; + weights_tz[4] = w; + } + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // Get unique name for storing MKLDNN primitives + const std::string key = platform::ConvTransposeMKLDNNHandler::GetHash( + src_tz, weights_tz, strides, paddings, dilations, groups, + ctx.op().Output("Output")); + const std::string key_conv_transpose_pd = key + "@conv_transpose_pd"; + + std::vector pipeline; + + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input->format()); + auto user_weights_md = + platform::MKLDNNMemDesc({weights_tz}, platform::MKLDNNGetDataType(), + (g == 1) ? mkldnn::memory::format::oihw + : mkldnn::memory::format::goihw); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + bool fuse_relu = ctx.Attr("fuse_relu"); + + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. + // Currently used whenever bias is != nullptr. + auto dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + + // create a deconv(conv transpose) primitive descriptor and save it for + // usage in backward + std::shared_ptr + conv_transpose_pd; + auto fwd_prop_kind = is_test ? 
mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training; + if (bias) { + bias_tz = paddle::framework::vectorize2int(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc( + bias_tz, platform::MKLDNNGetDataType(), mkldnn::memory::format::x); + conv_transpose_pd = ConvTransposeFwdPrimitiveDesc( + src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, + fuse_relu, fwd_prop_kind); + } else { + conv_transpose_pd = ConvTransposeFwdPrimitiveDesc( + src_md, weights_md, dst_md, strides, paddings, mkldnn_engine, + fuse_relu, fwd_prop_kind); + } + // Save conv_pd/src_memory/weights_memory for backward pass + if (!is_test) dev_ctx.SetBlob(key_conv_transpose_pd, conv_transpose_pd); + + platform::ConvTransposeMKLDNNHandler handler(conv_transpose_pd, dev_ctx, + mkldnn_engine, key); + + // create mkldnn memory from input tensors (data/weights) + auto user_src_memory_p = handler.AcquireSrcMemory( + user_src_md, platform::to_void_cast(input_data)); + auto user_weights_memory_p = handler.AcquireWeightsMemory( + user_weights_md, platform::to_void_cast(filter_data), + is_test ? iohw2oihw_reorder : platform::user_function()); + + // create reorder primitive if the input format is not the preferred one + auto src_memory_p = + handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test); + + std::shared_ptr dst_memory_p; + + auto output_data = output->mutable_data( + ctx.GetPlace(), paddle::memory::Allocator::kDefault, + handler.GetDstMemorySize()); + dst_memory_p = handler.AcquireDstMemoryFromPrimitive( + platform::to_void_cast(output_data)); + + // create convolution op primitive + std::shared_ptr conv_p; + if (bias) { + const T* bias_data = bias->data(); + auto user_bias_md = + platform::MKLDNNMemDesc({bias_tz}, platform::MKLDNNGetDataType(), + mkldnn::memory::format::x); + auto user_bias_memory_p = handler.AcquireBiasMemory( + user_bias_md, platform::to_void_cast(bias_data)); + + auto bias_memory_p = + handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline); + conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); + } + + // push primitive to stream and wait until it's executed + pipeline.push_back(*conv_p); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); + } + + private: + mkldnn::primitive_attr CreatePostOps(bool fuse_relu) const { + mkldnn::primitive_attr conv_attr; + mkldnn::post_ops post_operations; + // Fusion with ReLU layer is executed through the PostOps feature. Create a + // PostOps object and configure it to execute an eltwise relu operation. 
+ if (fuse_relu) { + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 0.0f; + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + } + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + + std::unique_ptr + ConvTransposeFwdPrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& dst, const std::vector& strides, + const std::vector& paddings, const mkldnn::engine& engine, + const bool fuse_relu, mkldnn::prop_kind fwd_prop_kind) const { + mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; + mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto deconv_desc = mkldnn::deconvolution_forward::desc( + fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, dst, + stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu); + + auto p_conv_transpose_pd = + new mkldnn::deconvolution_forward::primitive_desc(deconv_desc, + deconv_attr, engine); + + return std::unique_ptr( + p_conv_transpose_pd); + } + + std::unique_ptr + ConvTransposeFwdPrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& bias, const mkldnn::memory::desc& dst, + const std::vector& strides, const std::vector& paddings, + const mkldnn::engine& engine, const bool fuse_relu, + mkldnn::prop_kind fwd_prop_kind) const { + mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; + mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto deconv_desc = mkldnn::deconvolution_forward::desc( + fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, bias, dst, + stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu); + + auto p_conv_transpose_pd = + new mkldnn::deconvolution_forward::primitive_desc(deconv_desc, + deconv_attr, engine); + + return std::unique_ptr( + p_conv_transpose_pd); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(conv2d_transpose, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConvTransposeMKLDNNOpKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index a916dd3496..2fdfc40d19 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -16,6 +16,10 @@ limitations under the License. 
*/ #include #include +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { @@ -78,29 +82,38 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { + framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); bool use_cudnn = ctx.Attr("use_cudnn"); use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(ctx.GetPlace())) { auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } } #endif - framework::LibraryType library_; - if (use_cudnn) { - library_ = framework::LibraryType::kCUDNN; - } else { - library_ = framework::LibraryType::kPlain; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } +#endif - std::string data_format = ctx.Attr("data_format"); - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), layout_, library_); } void Conv2DTransposeOpMaker::Make() { + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); AddInput( "Input", "(Tensor) The input tensor of convolution transpose operator. " @@ -145,6 +158,11 @@ void Conv2DTransposeOpMaker::Make() { "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -238,6 +256,9 @@ void Conv3DTransposeOpMaker::Make() { "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc new file mode 100644 index 0000000000..e63d57be57 --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -0,0 +1,218 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class CudnnLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(Weight) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("InitH"), + "Input(init_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InitC"), + "Input(init_c) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cache"), + "Input(Cache) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("last_h"), + "Output(last_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("last_c"), + "Output(last_c) of LSTM should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3."); + + ctx->SetOutputDim("Out", ctx->GetInputDim("Input")); + ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH")); + ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC")); + } +}; + +class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "(Tensor) RNN input tensor, which support variable-time length input " + "sequence." + "The shape of the Tensor MUST be ( seq_len * batch_size * input_size)" + "seq_len is the total time step in this mini-batch (CAN be change in " + "different batch)" + "batch_size is the instance number of this batch" + "input_size is the hidden size of the input." + "input_hidden_size and the hidden_size in the next may not be same"); + AddInput("InitH", + "(Tensor) the initial hidden state of the LSTM" + "input. This is a tensor with shape (num_layers x batch_size x " + "hidden_size)" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddInput("InitC", + "(Tensor) the initial cell state of the LSTm " + "input. This is a tensor with shape (num_layers x batch_size x " + "hidden_size)" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddInput("W", + "(Tensor) the learnable hidden-hidden weights." + " The shape is (N), where N is total weight size of the LSTM. " + " cudnn concatenate all the weight to one Tensor"); + AddInput("Cache", + "The cache of dropout op, a RAW type variable including random " + "number generator states and some descriptors, which is used in " + "cudnn kernel.") + .AsDispensable(); + AddOutput("Out", + "(Tensor) the hidden state of LSTM operator. " + "The shape is ( seq_len x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirec is True, the shape will be ( seq_len x " + "batch_size x hidden_size * 2) "); + AddOutput("last_h", + "(Tensor) the hidden state of the last step. 
" + "The shape is ( num_layers x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddOutput("last_c", + "(Tensor) the cell state of the last step" + "The shape is ( num_layers x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirect is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size*2)"); + AddAttr("max_len", + "max length of the LSTM op" + "the first dim of the Input can NOT be greater than max_len") + .SetDefault(20); + AddAttr( + "dropout_prob", + "dropout prob of the dropout op" + "the dropout ONLY work between lstm layers, not between time steps" + "There is no dropout work on the Out tensor") + .SetDefault(0.0); + AddAttr("is_bidirec", + "is_bidirec" + "if it is bidirection rnn" + "The will affect the shape of the Out, last_h, and last_c") + .SetDefault(false); + AddAttr("input_size", "input size ot the Input Tensor").SetDefault(10); + AddAttr("hidden_size", "hidden size of the LSTM").SetDefault(100); + AddAttr("num_layers", "the total layer number of the LSTM") + .SetDefault(1); + AddAttr("is_test", "True if in test phase.").SetDefault(false); + AddAttr("seed", "seed to used if fix_seed is True").SetDefault(-1); + AddComment(R"DOC( +CUDNN LSTM implementation + +A four-gate Long Short-Term Memory network with no peephole connections. +In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, +the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: + +$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ + +$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ + +$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ + +$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ + +$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ + +$$ h_t = o_t \\odot tanh(c_t) $$ + +- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + of weights from the input gate to the input) +- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). +- sigmoid is the logistic sigmoid function. +- $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. +- The $\odot$ is the element-wise product of the vectors. +- `tanh` is the activation functions. +- $\tilde{c_t}$ is also called candidate hidden state, + which is computed based on the current input and the previous hidden state. 
+ +Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, +X represensts a matrix multiplication + + +)DOC"); + } +}; + +class CudnnLSTMGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("last_h"), + "Input(last_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("last_c"), + "Input(last_c) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("Cache"), + "Input(last_c) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InitH"), + "Input(init_h) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("InitC"), + "Input(init_c) of LSTM should not be null."); + + auto SetOutGradDim = [&ctx](const std::string& name) { + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) { + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + } + }; + + SetOutGradDim("Input"); + SetOutGradDim("W"); + SetOutGradDim("InitH"); + SetOutGradDim("InitC"); + } +}; + +template +class NotImpleKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW( + "CPU is not support for this kernel now. Will be add in the future"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); + +REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel); +REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc new file mode 100644 index 0000000000..e01070c7b8 --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -0,0 +1,485 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +struct CudnnRNNCache { + CudnnRNNCache() { + x_desc_ = NULL; + y_desc_ = NULL; + dx_desc_ = NULL; + dy_desc_ = NULL; + } + ~CudnnRNNCache() { release(); } + + cudnnRNNDescriptor_t rnn_desc_; + cudnnTensorDescriptor_t *x_desc_; + cudnnTensorDescriptor_t *y_desc_; + cudnnTensorDescriptor_t *dx_desc_; + cudnnTensorDescriptor_t *dy_desc_; + + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnTensorDescriptor_t dhx_desc_; + cudnnTensorDescriptor_t dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_; + cudnnTensorDescriptor_t dcy_desc_; + + cudnnTensorDescriptor_t output_x_desc_; + cudnnTensorDescriptor_t output_y_desc_; + + cudnnDropoutDescriptor_t dropout_desc_; + + size_t weights_size_; + cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t dw_desc_; + + size_t workspace_size_; + size_t reserve_size_; + Tensor reserve_data_; + Tensor workspace_data_; + + Tensor dropout_state_; + + size_t max_length_; + + float dropout_prob_; + bool is_bidirec_; + + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + int seed_; + + void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx, + size_t max_len, int batch_size, int input_size, int hidden_size, + int num_layers, float dropout_prob, bool is_bidirec, int seed, + int weight_numel) { + max_length_ = max_len; + batch_size_ = batch_size; + input_size_ = input_size; + hidden_size_ = hidden_size; + num_layers_ = num_layers; + dropout_prob_ = dropout_prob; + is_bidirec_ = is_bidirec; + seed_ = seed; + + x_desc_ = new cudnnTensorDescriptor_t[max_length_]; + y_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; + int dim_a[3]; + int stride_a[3]; + + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); + dim_a[0] = batch_size_; + dim_a[1] = input_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + dim_a[0] = batch_size_; + dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + } + + dim_a[0] = num_layers_ * (is_bidirec_ ? 
2 : 1); + dim_a[1] = batch_size_; + dim_a[2] = hidden_size_; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); + + size_t state_size; + CUDNN_ENFORCE( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); + dropout_state_.Resize({static_cast(state_size)})); + auto *dropout_state_data = + dropout_state_.mutable_data(ctx.GetPlace()); + CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( + dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, + seed_)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); + + PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, + "cudnn lstm weight size should be SAME"); + int dim_w[3]; + dim_w[0] = weights_size_ / sizeof(float); + dim_w[1] = 1; + dim_w[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); + + reserve_data_.Resize({static_cast(reserve_size_)}); + reserve_data_.mutable_data(ctx.GetPlace()); + + workspace_data_.Resize({static_cast(workspace_size_)}); + workspace_data_.mutable_data(ctx.GetPlace()); + } + + void release() { + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); + } + + delete[] x_desc_; + delete[] y_desc_; + delete[] dx_desc_; + delete[] dy_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); + } +}; + +template +class CudnnLSTMGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const Tensor *x = ctx.Input("Input"); + const Tensor *init_h = ctx.Input("InitH"); + const Tensor *init_c = ctx.Input("InitC"); + + auto w = ctx.Input("W"); + + Tensor *out = ctx.Output("Out"); + Tensor *last_h = ctx.Output("last_h"); + Tensor *last_c = ctx.Output("last_c"); + + const T *x_data = x->data(); + const T *init_h_data = init_h->data(); + const T *init_c_data = init_c->data(); + + const T *w_data = w->data(); + + T *out_data = out->mutable_data(ctx.GetPlace()); + T *last_h_data = last_h->mutable_data(ctx.GetPlace()); + T *last_c_data = last_c->mutable_data(ctx.GetPlace()); + + size_t max_len = ctx.Attr("max_len"); + float dropout_prob = ctx.Attr("dropout_prob"); 
+ bool is_bidirec = ctx.Attr("is_bidirec"); + int input_size = ctx.Attr("input_size"); + int hidden_size = ctx.Attr("hidden_size"); + int num_layers = ctx.Attr("num_layers"); + bool is_test = ctx.Attr("is_test"); + + auto &dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto *cache_var = ctx.InputVar("Cache"); + if (!cache_var) { + // The RAW type cache variable wouldn't be created and broadcasted on + // multi-devices before the first running. + // use parent scope to make cache persistable + auto *scope = const_cast(ctx.scope().parent()); + auto cache_var_name = ctx.Inputs("Cache")[0]; + cache_var = scope->Var(cache_var_name); + } + CudnnRNNCache *cudnn_rnn_cache = nullptr; + if (cache_var->IsInitialized()) { + cudnn_rnn_cache = const_cast(cache_var) + ->GetMutable(); + } else { + cudnn_rnn_cache = const_cast(cache_var) + ->GetMutable(); + std::random_device rnd; + int seed = ctx.Attr("seed"); + if (seed == -1) { + seed = rnd(); + } + + auto input_w_numel = w->numel(); + auto batch_size = x->dims()[1]; + cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, + hidden_size, num_layers, dropout_prob, is_bidirec, + seed, input_w_numel); + } + + auto run_seq_len = x->dims()[0]; + + if (is_test) { + // for inference + CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardInference( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, + init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, + cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, + cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, + last_c_data, cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_)); + } else { + // for train + CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardTraining( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, + init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, + cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, + cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, + last_c_data, cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_, + cudnn_rnn_cache->reserve_data_.data(), + cudnn_rnn_cache->reserve_size_)); + } + } +}; + +template +class CudnnLSTMGPUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *input = ctx.Input("Input"); + auto *weight = ctx.Input("W"); + auto *init_h = ctx.Input("InitH"); + auto *init_c = ctx.Input("InitC"); + // auto * last_h = ctx.Input("last_h"); + // auto * last_c = ctx.Input("last_c"); + auto *out = ctx.Input("Out"); + auto *out_grad = ctx.Input(framework::GradVarName("Out")); + auto *last_h_grad = ctx.Input(framework::GradVarName("last_h")); + auto *last_c_grad = ctx.Input(framework::GradVarName("last_c")); + + // auto* init_h = ctx.Input("init_h"); + // auto* init_c = ctx.Input("init_c"); + + auto *in_grad = ctx.Output(framework::GradVarName("Input")); + auto *weight_grad = ctx.Output(framework::GradVarName("W")); + auto *init_h_grad = ctx.Output(framework::GradVarName("InitH")); + auto *init_c_grad = ctx.Output(framework::GradVarName("InitC")); + + auto &dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto *cache_var = ctx.InputVar("Cache"); + PADDLE_ENFORCE(cache_var->IsInitialized()); + CudnnRNNCache *cudnn_rnn_cache = + const_cast(cache_var) + ->GetMutable(); + + auto input_dims 
= input->dims(); + auto weight_dims = weight->dims(); + auto init_h_dims = init_h->dims(); + auto init_c_dims = init_c->dims(); + in_grad->mutable_data(ctx.GetPlace()); + weight_grad->mutable_data(ctx.GetPlace()); + math::SetConstant zero; + zero(dev_ctx, in_grad, static_cast(0.0)); + zero(dev_ctx, weight_grad, static_cast(0.0)); + + T *init_h_grad_data = NULL; + if (init_h_grad == nullptr) { + Tensor init_h_grad_temp; + init_h_grad_temp.mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, &init_h_grad_temp, static_cast(0.0)); + + init_h_grad_data = init_h_grad_temp.data(); + } else { + init_h_grad->mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, init_h_grad, static_cast(0.0)); + init_h_grad_data = init_h_grad->data(); + } + + T *init_c_grad_data = NULL; + if (init_c_grad == nullptr) { + Tensor init_c_grad_temp; + init_c_grad_temp.mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, &init_c_grad_temp, static_cast(0.0)); + + init_c_grad_data = init_c_grad_temp.data(); + } else { + init_c_grad->mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, init_c_grad, static_cast(0.0)); + init_c_grad_data = init_c_grad->data(); + } + + const T *last_h_grad_data = NULL; + if (last_h_grad == nullptr) { + Tensor last_h_grad_temp; + last_h_grad_temp.mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, &last_h_grad_temp, static_cast(0.0)); + + last_h_grad_data = (const T *)last_h_grad_temp.data(); + } else { + last_h_grad_data = last_h_grad->data(); + } + + const T *last_c_grad_data = NULL; + if (last_c_grad == nullptr) { + Tensor last_c_grad_temp; + last_c_grad_temp.mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, &last_c_grad_temp, static_cast(0.0)); + + last_c_grad_data = (const T *)last_c_grad_temp.data(); + } else { + last_c_grad_data = last_c_grad->data(); + } + + const T *out_grad_data = NULL; + if (out_grad == nullptr) { + Tensor out_grad_temp; + out_grad_temp.mutable_data(out->dims(), ctx.GetPlace()); + zero(dev_ctx, &out_grad_temp, static_cast(0.0)); + + out_grad_data = (const T *)out_grad_temp.data(); + } else { + out_grad_data = out_grad->data(); + } + + // zero( dev_ctx, last_h_grad, static_cast(0.0)); + // zero( dev_ctx, last_c_grad, static_cast(0.0)); + + auto out_data = out->data(); + // auto out_grad_data = out_grad->data(); + auto weight_data = weight->data(); + auto init_h_data = init_h->data(); + auto init_c_data = init_c->data(); + auto in_grad_data = in_grad->data(); + + auto work_data = cudnn_rnn_cache->workspace_data_.data(); + auto reserve_data = cudnn_rnn_cache->reserve_data_.data(); + + auto run_seq_len = input_dims[0]; + PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_, + "cudnn running seq_len CAN not greater max_lengh"); + CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardData( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_, + out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data, + cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_, + weight_data, cudnn_rnn_cache->hx_desc_, init_h_data, + cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_, + in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data, + cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data, + cudnn_rnn_cache->workspace_size_, reserve_data, + cudnn_rnn_cache->reserve_size_)); + + CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardWeights( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, input->data(), 
cudnn_rnn_cache->hx_desc_, + init_h->data(), cudnn_rnn_cache->y_desc_, out->data(), + cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_, + weight_grad->data(), cudnn_rnn_cache->reserve_data_.data(), + cudnn_rnn_cache->reserve_size_)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel); +REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel); diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index 5ed8520acd..b2a2bcdce9 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -43,6 +43,9 @@ class BoxCoderKernel : public framework::OpKernel { const T* prior_box_var_data = nullptr; if (prior_box_var) prior_box_var_data = prior_box_var->data(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { T prior_box_width = prior_box_data[j * len + 2] - @@ -96,6 +99,9 @@ class BoxCoderKernel : public framework::OpKernel { const T* prior_box_var_data = nullptr; if (prior_box_var) prior_box_var_data = prior_box_var->data(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { size_t offset = i * col * len + j * len; diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 21db93958a..36979de68f 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -9,36 +9,37 @@ else() endif() configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + if(WITH_GRPC) grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(grpc_serde_test SRCS grpc_serde_test.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) - return() -endif() - - -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory) +else() + set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc + brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc - brpc_variable_response.cc brpc_sendrecvop_utils.cc 
brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc + brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc + PROTO send_recv.proto + DEPS lod_tensor selected_rows memory) -brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc - brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc - PROTO send_recv.proto - DEPS lod_tensor selected_rows memory) + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory) -set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) + set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) -cc_test(brpc_server_test SRCS rpc_server_test.cc - DEPS ${brpc_test_depends} SERIAL) + cc_test(brpc_server_test SRCS rpc_server_test.cc + DEPS ${brpc_test_depends} SERIAL) -cc_test(brpc_serde_test SRCS brpc_serde_test.cc - DEPS ${brpc_test_depends} SERIAL) + cc_test(brpc_serde_test SRCS brpc_serde_test.cc + DEPS ${brpc_test_depends} SERIAL) +endif() diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc index 47a06dd0f3..862167f020 100644 --- a/paddle/fluid/operators/distributed/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc_server.cc @@ -133,10 +133,10 @@ void AsyncBRPCServer::StartServer() { void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } void AsyncBRPCServer::WaitServerReady() { - VLOG(30) << "AsyncGRPCServer is wait server ready"; + VLOG(3) << "AsyncGRPCServer is wait server ready"; std::unique_lock lock(this->mutex_ready_); condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(30) << "AsyncGRPCServer WaitSeverReady"; + VLOG(3) << "AsyncGRPCServer WaitSeverReady"; } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 0bd76b3f6c..d7f3ea86af 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "glog/logging.h" // For VLOG @@ -20,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/distributed/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(rpc_disable_reuse_port); @@ -40,7 +40,7 @@ void GRPCClient::SendComplete() { std::unique_lock lk(completed_mutex_); if (!completed_) { for (auto& it : channels_) { - VLOG(30) << "send complete message to " << it.first; + VLOG(3) << "send complete message to " << it.first; this->AsyncSendComplete(it.first); } PADDLE_ENFORCE(this->Wait(), "internal grpc error"); @@ -83,7 +83,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, ::grpc::ByteBuffer req; SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context s->response_call_back_ = nullptr; @@ -144,7 +144,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, ::grpc::ByteBuffer buf; RequestToByteBuffer(req, &buf); - VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context s->response_call_back_ = ProcGetResponse; @@ -171,11 +171,13 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, const framework::Scope& scope, const std::string& in_var_name, const std::string& out_var_name, + const std::string& table_name, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; const std::string out_var_name_val = out_var_name; + const std::string table_name_val = table_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); @@ -186,13 +188,14 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, s->Prepare(h, time_out); framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - s, method, h, this] { + s, method, h, table_name_val, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; - SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val); + SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val, + 0, table_name_val); - VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context s->response_call_back_ = ProcGetResponse; @@ -330,14 +333,14 @@ void GRPCClient::Proceed() { void* tag = nullptr; bool ok = false; - VLOG(30) << "GRPCClient Proceed begin"; + VLOG(3) << "GRPCClient Proceed begin"; while (!stopped_ && cq_.Next(&tag, &ok)) { BaseProcessor* c = static_cast(tag); GPR_ASSERT(ok); PADDLE_ENFORCE(c); if (c->status_.ok()) { - VLOG(30) << c->GetVarHandlePtr()->String() << " process"; + VLOG(3) << c->GetVarHandlePtr()->String() << " process"; c->Process(); } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { // FIXME(gongwb): parse error_details? 
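The VLOG(30)/VLOG(40) to VLOG(3)/VLOG(4) renumbering that runs through these files matters because glog only emits VLOG(n) when the verbosity flag is at least n; levels like 30 or 40 are rarely enabled in practice, so those messages were effectively invisible. A small sketch of the behaviour, assuming glog is linked:

#include <glog/logging.h>

int main(int argc, char* argv[]) {
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = true;
  FLAGS_v = 4;                          // same effect as passing --v=4 or GLOG_v=4
  VLOG(3) << "printed, 3 <= 4";         // visible at this verbosity
  VLOG(4) << "printed, 4 <= 4";         // visible
  VLOG(30) << "suppressed, 30 > 4";     // the old levels were almost never emitted
  return 0;
}
// end of illustrative sketch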
@@ -372,7 +375,7 @@ void GRPCClient::Proceed() { sync_cond_.notify_all(); } } - VLOG(30) << "GRPCClient Proceed end"; + VLOG(3) << "GRPCClient Proceed end"; } std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index d8e9cee85b..a31a465645 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -194,6 +194,7 @@ class GRPCClient : public RPCClient { const framework::Scope& scope, const std::string& in_var_name, const std::string& out_var_name, + const std::string& table_name = "", int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncSendBatchBarrier( diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index f27b70a5a3..31fac2133c 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -15,7 +15,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif -#include #include // NOLINT #include "google/protobuf/io/coded_stream.h" @@ -26,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -42,7 +42,8 @@ static void SerializeDestroyCallback(void* payload) { void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, ::grpc::ByteBuffer* msg, const std::string& out_name, - const int trainer_id) { + const int trainer_id, + const std::string& table_name) { platform::RecordRPCEvent record_event("serial", &ctx); VarMsg request; TensorPayload* payload = nullptr; @@ -63,6 +64,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, if (!out_name.empty()) { request.set_out_varname(out_name); } + if (!table_name.empty()) { + request.set_table_name(table_name); + } if (var->IsType()) { request.set_type(::sendrecv::LOD_TENSOR); payload = new TensorPayload(GetTensorPayload(var, ctx, &request)); diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc_serde.h index 7ec489e961..16f5293b0e 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.h +++ b/paddle/fluid/operators/distributed/grpc_serde.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include + #include #include #include @@ -25,6 +25,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" @@ -39,7 +40,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, ::grpc::ByteBuffer* msg, const std::string& out_varname = std::string(), - const int trainer_id = 0); + const int trainer_id = 0, + const std::string& table_name = std::string()); void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, const platform::DeviceContext& ctx, diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc index 96ea05e74e..1936c2c623 100644 --- a/paddle/fluid/operators/distributed/grpc_serde_test.cc +++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc @@ -130,7 +130,8 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { math::set_constant(ctx, tensor, 31.9); ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); + operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg, + "outvar", 0, "table_name"); EXPECT_GT(msg.Length(), static_cast(0)); // deserialize diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index 77bf67be25..d9200c98b2 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -100,7 +100,7 @@ class RequestSend final : public RequestBase { void Process() override { std::string varname = GetReqName(); - VLOG(40) << "RequestSend var_name:" << varname; + VLOG(4) << "RequestSend var_name:" << varname; auto scope = request_->GetMutableLocalScope(); auto invar = request_->GetVar(); @@ -137,7 +137,7 @@ class RequestGet final : public RequestBase { // proc request. std::string varname = request_.varname(); int trainer_id = request_.trainer_id(); - VLOG(40) << "RequestGet " << varname; + VLOG(4) << "RequestGet " << varname; auto scope = request_handler_->scope(); auto invar = scope->FindVar(varname); @@ -183,9 +183,10 @@ class RequestPrefetch final : public RequestBase { // prefetch process... 
std::string in_var_name = request_->Varname(); std::string out_var_name = request_->OutVarname(); + std::string table_name = request_->TableName(); int trainer_id = request_->GetTrainerId(); - VLOG(40) << "RequestPrefetch, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name; + VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name + << " out_var_name: " << out_var_name; auto scope = request_->GetMutableLocalScope(); auto invar = scope->FindVar(in_var_name); @@ -193,7 +194,7 @@ class RequestPrefetch final : public RequestBase { framework::Variable* outvar = scope->Var(out_var_name); request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name); + out_var_name, table_name); SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), &reply_); @@ -233,8 +234,8 @@ class RequestCheckpointNotify final : public RequestBase { std::string checkpoint_dir = request_->OutVarname(); int trainer_id = request_->GetTrainerId(); - VLOG(40) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir; + VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify + << ", dir: " << checkpoint_dir; request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, trainer_id, checkpoint_dir); @@ -248,10 +249,10 @@ class RequestCheckpointNotify final : public RequestBase { }; void AsyncGRPCServer::WaitServerReady() { - VLOG(40) << "AsyncGRPCServer is wait server ready"; + VLOG(4) << "AsyncGRPCServer is wait server ready"; std::unique_lock lock(this->mutex_ready_); condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(40) << "AsyncGRPCServer WaitSeverReady"; + VLOG(4) << "AsyncGRPCServer WaitSeverReady"; } // Define an option subclass in order to disable SO_REUSEPORT for the @@ -302,15 +303,14 @@ void AsyncGRPCServer::StartServer() { reqs.reserve(kRequestBufSize); for (int i = 0; i < kRequestBufSize; i++) { - VLOG(60) << "TryToRegisterNewOne on RPC NAME: " << rpc_name - << " I: " << i; + VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i; TryToRegisterNewOne(rpc_name, i); } for (int i = 0; i < threadnum; i++) { rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); - VLOG(40) << t.first << " creates threads!"; + VLOG(4) << t.first << " creates threads!"; } } @@ -327,7 +327,7 @@ void AsyncGRPCServer::StartServer() { auto& threads = t.second; for (size_t i = 0; i < threads.size(); ++i) { threads[i]->join(); - VLOG(40) << t.first << " threads ends!"; + VLOG(4) << t.first << " threads ends!"; } } } @@ -335,7 +335,7 @@ void AsyncGRPCServer::StartServer() { void AsyncGRPCServer::ShutdownQueue() { for (auto& t : rpc_cq_) { t.second->Shutdown(); - VLOG(40) << t.first << " queue shutdown!"; + VLOG(4) << t.first << " queue shutdown!"; } } @@ -344,7 +344,7 @@ void AsyncGRPCServer::ShutDownImpl() { is_shut_down_ = true; ShutdownQueue(); - VLOG(40) << "server_ shutdown!"; + VLOG(4) << "server_ shutdown!"; server_->Shutdown(); } @@ -352,12 +352,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, int req_id) { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { - VLOG(40) << "shutdown, do not TryToRegisterNewSendOne"; + VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; return; } - VLOG(40) << "TryToRegisterNewOne on RPC NAME: " << rpc_name - << " REQ ID: " << req_id; + VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name + << " REQ ID: " << req_id; auto& reqs = 
rpc_reqs_[rpc_name]; auto& handler = rpc_call_map_[rpc_name]; @@ -378,7 +378,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, reqs[req_id] = b; - VLOG(40) << "Create RequestSend status:" << b->Status(); + VLOG(4) << "Create RequestSend status:" << b->Status(); } void AsyncGRPCServer::HandleRequest( @@ -388,15 +388,15 @@ void AsyncGRPCServer::HandleRequest( bool ok = false; while (true) { - VLOG(40) << "HandleRequest " << rpc_name << " wait next"; + VLOG(4) << "HandleRequest " << rpc_name << " wait next"; if (!cq->Next(&tag, &ok)) { - VLOG(30) << "CompletionQueue " << rpc_name << " shutdown!"; + VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!"; break; } int req_id = static_cast(reinterpret_cast(tag)); - VLOG(40) << "HandleRequest " << rpc_name << ", req_id:" << req_id - << " get next"; + VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id + << " get next"; auto& reqs = rpc_reqs_[rpc_name]; RequestBase* base = nullptr; @@ -406,7 +406,7 @@ void AsyncGRPCServer::HandleRequest( base = reqs[req_id]; } - VLOG(30) << base->Status2String(rpc_name); + VLOG(3) << base->Status2String(rpc_name); // reference: // https://github.com/tensorflow/tensorflow/issues/5596 diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc_variable_response.cc index d6d219d436..76ad02b030 100644 --- a/paddle/fluid/operators/distributed/grpc_variable_response.cc +++ b/paddle/fluid/operators/distributed/grpc_variable_response.cc @@ -301,6 +301,20 @@ int GRPCVariableResponse::Parse(Source* source) { meta_.set_trainer_id(trainer_id); break; } + case sendrecv::VariableMessage::kTableNameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_table_name(temp); + break; + } default: { // Unknown tag, return unknown error. return -1; diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc new file mode 100644 index 0000000000..cf14538b1c --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -0,0 +1,255 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
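The kTableNameFieldNumber case added to GRPCVariableResponse::Parse above reads one length-delimited string from the incoming byte stream. For reference, field 13 with wire type 2 is encoded as the tag byte (13 << 3) | 2 = 0x6A, a varint length, then the raw bytes. A self-contained sketch using protobuf's CodedInputStream (the buffer contents and the table name "emb" are hypothetical):

#include <cstdint>
#include <iostream>
#include <string>
#include <google/protobuf/io/coded_stream.h>

int main() {
  // 0x6A = tag for field 13, wire type LENGTH_DELIMITED; 0x03 = payload length.
  const uint8_t buf[] = {0x6A, 0x03, 'e', 'm', 'b'};
  google::protobuf::io::CodedInputStream input(buf, sizeof(buf));

  uint32_t tag = input.ReadTag();                      // (13 << 3) | 2
  uint32_t length = 0;
  std::string table_name;
  if (tag == ((13u << 3) | 2u) && input.ReadVarint32(&length) &&
      input.ReadString(&table_name, length)) {
    std::cout << "table_name = " << table_name << "\n";  // prints "emb"
  }
  return 0;
}
// end of illustrative sketch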
+ +#include +#include +#include + +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +static size_t GetSectionIndex(int64_t id, + const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (id < abs_sections[i]) { + return i - 1; + } + } + return abs_sections.size() - 1; +} + +static std::vector ToAbsoluteSection( + const std::vector& height_sections) { + std::vector abs_sections; + abs_sections.resize(height_sections.size()); + abs_sections[0] = 0; + for (size_t i = 1; i < height_sections.size(); ++i) { + abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; + } + return abs_sections; +} + +static std::vector> SplitIds( + const std::vector& ids_vector, + const std::vector& height_section, framework::Scope* scope) { + std::set all_ids; + for (auto id : ids_vector) { + all_ids.insert(id); + } + + auto abs_sections = ToAbsoluteSection(height_section); + std::vector> splited_ids; + splited_ids.resize(height_section.size() + 1); + for (auto& id : all_ids) { + auto section_index = GetSectionIndex(id, abs_sections); + splited_ids[section_index].push_back(id - abs_sections[section_index]); + } + return splited_ids; +} + +static void SplitIdsIntoMultipleVarsBySection( + const std::vector& in_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + framework::Scope* scope) { + PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), ""); + + auto place = platform::CPUPlace(); + + for (size_t i = 0; i < in_var_names.size(); ++i) { + auto* id_tensor = + scope->Var(in_var_names[i])->GetMutable(); + auto& ids = splited_ids[i]; + if (!ids.empty()) { + auto* id_tensor_data = id_tensor->mutable_data( + framework::make_ddim({static_cast(ids.size()), 1}), place); + memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); + } + } +} + +static void MergeMultipleVarsIntoOneBySection( + const std::string& id_name, const std::vector& ids_vector, + const std::string& out_name, const std::vector& out_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + const framework::ExecutionContext& context, framework::Scope* scope, + platform::DeviceContext* actual_ctx) { + PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), ""); + + auto cpu_place = platform::CPUPlace(); + + auto abs_sections = ToAbsoluteSection(height_section); + std::unordered_map> id_to_offset; + for (size_t i = 0; i < ids_vector.size(); ++i) { + id_to_offset[ids_vector[i]].push_back(i); + } + + auto& id_tensor = scope->FindVar(id_name)->Get(); + auto* out_tensor = + scope->FindVar(out_name)->GetMutable(); + auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); + + bool is_on_cpu_place = true; + if (!platform::is_cpu_place(id_tensor.place())) { + is_on_cpu_place = false; + } + + for (size_t section_idx = 0; section_idx < out_var_names.size(); + 
++section_idx) { + auto& ids_in_this_section = splited_ids[section_idx]; + if (!ids_in_this_section.empty()) { + auto& prefetch_out_var = + scope->Var(out_var_names[section_idx])->Get(); + const auto* out_var_data = prefetch_out_var.data(); + auto& dims = prefetch_out_var.dims(); + + PADDLE_ENFORCE_EQ(dims.size(), 2, ""); + PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]); + + auto row_numel = dims[1]; + + for (size_t i = 0; i < dims[0]; ++i) { + auto id = ids_in_this_section[i]; + auto origin_id = id + abs_sections[section_idx]; + auto& offsets = id_to_offset[origin_id]; + for (auto& offset : offsets) { + // should support GPU tensor + if (is_on_cpu_place) { + memory::Copy(cpu_place, out_tensor_data + offset * row_numel, + cpu_place, out_var_data + i * row_numel, + sizeof(float) * row_numel); + } else { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("paddle is not compiled with CUDA!"); +#else + auto stream = + static_cast(actual_ctx)->stream(); + memory::Copy(boost::get(id_tensor.place()), + out_tensor_data + offset * row_numel, cpu_place, + out_var_data + i * row_numel, + sizeof(float) * row_numel, stream); +#endif + } + } + } + } else { + VLOG(3) << "ids in this section is empty"; + } + } +} + +void prefetch(const std::string& id_name, const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context) { + auto& local_scope = context.scope().NewScope(); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& cpu_ctx = *pool.Get(platform::CPUPlace()); + auto& actual_ctx = *pool.Get(context.GetPlace()); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance( + context.Attr("trainer_id")); + + std::vector in_var_names; + std::vector out_var_names; + for (size_t i = 0; i < epmap.size(); ++i) { + in_var_names.push_back(id_name + "@" + epmap[i]); + out_var_names.push_back(out_name + "@" + epmap[i]); + } + + auto& id_tensor = local_scope.FindVar(id_name)->Get(); + std::vector ids_vector; + if (platform::is_cpu_place(id_tensor.place())) { + auto* id_data = id_tensor.data(); + for (size_t i = 0; i < id_tensor.numel(); ++i) { + ids_vector.push_back(id_data[i]); + } + } else { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("paddle is not compiled with CUDA!"); +#else + auto cpu_place = platform::CPUPlace(); + framework::Tensor cpu_tensor; + auto* cpu_tensor_data = + cpu_tensor.mutable_data(id_tensor.dims(), cpu_place); + auto stream = + static_cast(&actual_ctx)->stream(); + memory::Copy(cpu_place, cpu_tensor_data, + boost::get(id_tensor.place()), + id_tensor.data(), sizeof(int64_t) * id_tensor.numel(), + stream); + for (size_t i = 0; i < cpu_tensor.numel(); ++i) { + ids_vector.push_back(cpu_tensor_data[i]); + } +#endif + } + + auto splited_ids = SplitIds(ids_vector, height_sections, &local_scope); + SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids, + &local_scope); + + // create output var in local scope + for (auto& name : out_var_names) { + local_scope.Var(name)->GetMutable(); + } + + std::vector rets; + for (size_t i = 0; i < in_var_names.size(); i++) { + if (NeedSend(local_scope, in_var_names[i])) { + VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i] + << " to get " << out_var_names[i] << " back"; + rets.push_back(rpc_client->AsyncPrefetchVar( + epmap[i], cpu_ctx, local_scope, in_var_names[i], out_var_names[i], + table_names[i])); + } else { + VLOG(3) << "don't send no-initialied variable: " << 
out_var_names[i]; + } + } + + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + + MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, + out_var_names, height_sections, splited_ids, + context, &local_scope, &actual_ctx); + + context.scope().DeleteScope(&local_scope); +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h new file mode 100644 index 0000000000..53b0fbfb51 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void prefetch(const std::string& id_name, const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context); + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 3bcc59a47b..5272afd428 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -75,7 +75,7 @@ class VarHandle { wait_cond_.wait(lk, [this] { return status_ != kDefaultState; }); ret = status_; } - VLOG(70) << "VarHandle wait:" << ret; + VLOG(7) << "VarHandle wait:" << ret; return ret != kErrorState; } @@ -84,7 +84,7 @@ class VarHandle { std::unique_lock lk(sync_mutex_); status_ = ok ? kFinishState : kErrorState; } - VLOG(70) << "VarHandle finish:" << ok; + VLOG(7) << "VarHandle finish:" << ok; wait_cond_.notify_all(); } @@ -191,7 +191,8 @@ class RequestHandler { virtual bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name = "") = 0; + const std::string& out_var_name = "", + const std::string& table_name = "") = 0; protected: const bool sync_mode_; diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index dae56cc843..9722f8c96e 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
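parameter_prefetch.cc above shards a flat id list across the parameter servers listed in epmap: ToAbsoluteSection turns per-server heights into prefix sums, GetSectionIndex maps a global row id to its server, and SplitIds records the local row offset on that server. A standalone sketch of the same arithmetic with assumed shard heights {4, 6} and ids {1, 3, 5, 9}:

#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

int main() {
  // Two shards holding rows [0, 4) and [4, 10): heights {4, 6} -> prefix sums {0, 4}.
  std::vector<int64_t> abs_sections = {0, 4};
  std::set<int64_t> ids = {1, 3, 5, 9};            // deduplicated global row ids

  std::vector<std::vector<int64_t>> splited(abs_sections.size());
  for (int64_t id : ids) {
    size_t section = abs_sections.size() - 1;      // default to the last shard
    for (size_t i = 1; i < abs_sections.size(); ++i) {
      if (id < abs_sections[i]) { section = i - 1; break; }
    }
    splited[section].push_back(id - abs_sections[section]);  // local offset on that shard
  }

  // shard 0 gets local rows {1, 3}; shard 1 gets {1, 5} (global ids 5 and 9)
  for (size_t s = 0; s < splited.size(); ++s) {
    std::cout << "shard " << s << ":";
    for (int64_t v : splited[s]) std::cout << " " << v;
    std::cout << "\n";
  }
  return 0;
}
// end of illustrative sketch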
+#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include #include #include @@ -20,7 +21,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/string/printf.h" @@ -37,20 +38,21 @@ bool RequestSendHandler::Handle(const std::string& varname, framework::Variable* invar, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name) { - VLOG(40) << "RequestSendHandler:" << varname; + const std::string& out_var_name, + const std::string& table_name) { + VLOG(4) << "RequestSendHandler:" << varname; // Sync if (varname == BATCH_BARRIER_MESSAGE) { - VLOG(30) << "sync: recv BATCH_BARRIER_MESSAGE"; + VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; rpc_server_->IncreaseBatchBarrier(kRequestSend); } else if (varname == COMPLETE_MESSAGE) { - VLOG(30) << "sync: recv complete message"; + VLOG(3) << "sync: recv complete message"; rpc_server_->Complete(); } else { // Async if (!sync_mode_) { - VLOG(30) << "async process var: " << varname; + VLOG(3) << "async process var: " << varname; try { executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), scope); @@ -61,7 +63,7 @@ bool RequestSendHandler::Handle(const std::string& varname, return true; } else { // sync rpc_server_->WaitCond(kRequestSend); - VLOG(30) << "sync: processing received var: " << varname; + VLOG(3) << "sync: processing received var: " << varname; if (invar == nullptr) { LOG(FATAL) << "sync: Can not find server side var: " << varname; @@ -77,11 +79,13 @@ bool RequestGetHandler::Handle(const std::string& varname, framework::Variable* invar, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name) { - VLOG(40) << "RequestGetHandler:" << varname; + const std::string& out_var_name, + const std::string& table_name) { + VLOG(4) << "RequestGetHandler:" << varname; + if (sync_mode_) { if (varname == FETCH_BARRIER_MESSAGE) { - VLOG(30) << "sync: recv fetch barrier message"; + VLOG(3) << "sync: recv fetch barrier message"; rpc_server_->IncreaseBatchBarrier(kRequestGet); } else { rpc_server_->WaitCond(kRequestGet); @@ -93,14 +97,13 @@ bool RequestGetHandler::Handle(const std::string& varname, // NOTE: the format is determined by distributed_transpiler.py std::string param_bak_name = string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); - VLOG(30) << "getting " << param_bak_name << " trainer_id " - << trainer_id; + VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; auto var = scope_->FindVar(varname); auto t_orig = var->Get(); auto param_bak = scope_->Var(param_bak_name); auto t = param_bak->GetMutable(); t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); - VLOG(30) << "copying " << varname << " to " << param_bak_name; + VLOG(3) << "copying " << varname << " to " << param_bak_name; framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); } *outvar = scope_->FindVar(varname); @@ -114,14 +117,22 @@ bool RequestPrefetchHandler::Handle(const std::string& varname, framework::Variable* invar, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name) { - VLOG(40) << "RequestPrefetchHandler " << varname; - - auto var_desc = program_->Block(0).FindVar(out_var_name); - InitializeVariable(*outvar, var_desc->GetType()); - 
executor_->RunPreparedContext( - (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope); + const std::string& out_var_name, + const std::string& table_name) { + VLOG(4) << "RequestPrefetchHandler " << varname; + if (table_name.empty()) { + auto var_desc = program_->Block(0).FindVar(out_var_name); + InitializeVariable(*outvar, var_desc->GetType()); + executor_->RunPreparedContext( + (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope); + } else { + (*outvar)->GetMutable(); + auto lookup_table_op = + BuildLookupTableOp(table_name, varname, out_var_name); + paddle::platform::CPUPlace cpu_place; + lookup_table_op->Run(*scope, cpu_place); + } return true; } @@ -130,7 +141,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname, framework::Variable* invar, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name) { + const std::string& out_var_name, + const std::string& table_name) { PADDLE_ENFORCE( checkpoint_notify_id != -1, "when checkpoint_notify_id = -1, there should be no RPC invoke."); @@ -139,8 +151,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname, auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable(); lt_var->clear(); lt_var->append(out_var_name); - VLOG(40) << "RequestCheckpointHandler update var kLookupTablePath to: " - << out_var_name; + VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: " + << out_var_name; executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_); return true; } diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index c1afda9dd2..5e0b25c5c2 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -24,6 +24,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" @@ -43,8 +44,8 @@ class RequestSendHandler final : public RequestHandler { virtual ~RequestSendHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "") override; + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; private: bool enable_dc_asgd_; @@ -59,21 +60,44 @@ class RequestGetHandler final : public RequestHandler { virtual ~RequestGetHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "") override; + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; private: bool enable_dc_asgd_; }; +static inline void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + class RequestPrefetchHandler final : public RequestHandler { public: explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {} virtual ~RequestPrefetchHandler() {} bool Handle(const std::string& varname, 
framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "") override; + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; + + private: + std::unique_ptr BuildLookupTableOp( + const std::string& table_name, const std::string& id_name, + const std::string& out_name) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("lookup_table"); + BuildVar("W", {table_name.data()}, op_desc.add_inputs()); + BuildVar("Ids", {id_name.data()}, op_desc.add_inputs()); + BuildVar("Out", {out_name.data()}, op_desc.add_outputs()); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + return op; + } }; class RequestCheckpointHandler final : public RequestHandler { @@ -85,8 +109,8 @@ class RequestCheckpointHandler final : public RequestHandler { virtual ~RequestCheckpointHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "") override; + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; private: int checkpoint_notify_id; diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 1983802e49..4cd3abb5a6 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -48,7 +48,7 @@ class RPCClient { virtual VarHandlePtr AsyncPrefetchVar( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& in_var_name, - const std::string& out_var_name, + const std::string& out_var_name, const std::string& table_name = "", int64_t time_out = FLAGS_rpc_deadline) = 0; virtual VarHandlePtr AsyncSendBatchBarrier( diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 4055091104..3e30ed4ac8 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -39,7 +39,7 @@ void RPCServer::SavePort() const { port_file.open(file_path); port_file << selected_port_; port_file.close(); - VLOG(40) << "selected port written to " << file_path; + VLOG(4) << "selected port written to " << file_path; } void RPCServer::WaitBarrier(const std::string& rpc_name) { @@ -49,12 +49,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) { exit_flag_.load()); }); - VLOG(30) << "batch_barrier_: " << rpc_name << " " - << barrier_counter_[rpc_name]; + VLOG(3) << "batch_barrier_: " << rpc_name << " " + << barrier_counter_[rpc_name]; } void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(40) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; int b = 0; std::unique_lock lock(mutex_); b = ++barrier_counter_[rpc_name]; @@ -71,7 +71,7 @@ void RPCServer::Complete() { client_num_--; need_reset_all_vars_ = true; - VLOG(40) << "decrease client_num to: " << client_num_; + VLOG(4) << "decrease client_num to: " << client_num_; if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { barrier_counter_[kRequestGet]--; } @@ -90,7 +90,7 @@ int RPCServer::GetClientNum() { } void RPCServer::ResetBarrierCounter() { - VLOG(30) << "RPCServer ResetBarrierCounter "; + VLOG(3) << "RPCServer ResetBarrierCounter "; std::unique_lock lock(mutex_); for (auto& 
t : barrier_counter_) { t.second = 0; @@ -105,12 +105,12 @@ void RPCServer::RegisterRPC(const std::string& rpc_name, static int cond = -1; rpc_cond_map_[rpc_name] = ++cond; - VLOG(40) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler - << ", cond:" << rpc_cond_map_[rpc_name]; + VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler + << ", cond:" << rpc_cond_map_[rpc_name]; } void RPCServer::SetCond(const std::string& rpc_name) { - VLOG(30) << "RPCServer SetCond " << rpc_name; + VLOG(3) << "RPCServer SetCond " << rpc_name; { std::unique_lock lock(mutex_); cur_cond_ = rpc_cond_map_[rpc_name]; @@ -120,7 +120,7 @@ void RPCServer::SetCond(const std::string& rpc_name) { } void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(40) << "RPCServer WaitCond " << rpc_name; + VLOG(4) << "RPCServer WaitCond " << rpc_name; int cond = 0; { std::unique_lock lock(mutex_); diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in index 55820c980e..7b7d069f17 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -80,6 +80,7 @@ message VariableMessage { // when profile switches from 1 to 2. int64 profile = 11; int64 trainer_id = 12; + string table_name = 13; } message VoidMessage {} diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index df5af3476f..6ba883ba01 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -15,12 +15,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif -#include #include // NOLINT #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/platform/port.h" DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not."); diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 480fc59c42..523e56fe3e 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include #include #include #include @@ -24,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index f831793e9b..5b2be04e6a 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -50,7 +50,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input, size_to_write = length - total_written; } // This log is useful to see how long a internal block size is of rpc. 
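send_recv.proto.in above gains `string table_name = 13;`, which is what lets a prefetch request name the remote lookup table; the generated accessors are used on both sides of the RPC (set_table_name in grpc_serde.cc, table_name()/TableName() when the response is parsed). A minimal sketch of that convention, assuming the build-generated send_recv.pb.h header:

#include <string>
#include "paddle/fluid/operators/distributed/send_recv.pb.h"  // generated from send_recv.proto at build time

// Client side: only tag the request when a table lookup is actually wanted.
void TagPrefetchRequest(sendrecv::VariableMessage* msg, const std::string& table) {
  if (!table.empty()) {
    msg->set_table_name(table);
  }
}

// Server side: an empty table_name means "run the pre-compiled prefetch block",
// a non-empty one means "run lookup_table against that table".
bool WantsTableLookup(const sendrecv::VariableMessage& msg) {
  return !msg.table_name().empty();
}
// end of illustrative sketch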
- VLOG(70) << "copy " << size_to_write << " data to CUDAPlace"; + VLOG(7) << "copy " << size_to_write << " data to CUDAPlace"; memory::Copy(boost::get(place), reinterpret_cast(p), cpu, data, size_to_write, gpu_dev_ctx.stream()); @@ -79,7 +79,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input, // TODO(gongwb): can we avoid copy? platform::CPUPlace cpu; // This log is useful to see how long a internal block size is of rpc. - VLOG(70) << "copy " << size_to_write << " data to CPUPlace"; + VLOG(7) << "copy " << size_to_write << " data to CPUPlace"; memory::Copy(cpu, reinterpret_cast(p), cpu, data, size_to_write); p += size_to_write; @@ -198,8 +198,8 @@ bool VariableResponse::ProcSerializedField( #endif } - VLOG(70) << "ProcSerializedField:" << meta_.varname() - << ", type:" << meta_.type() << std::endl; + VLOG(7) << "ProcSerializedField:" << meta_.varname() + << ", type:" << meta_.type() << std::endl; framework::DDim dims = GetDims(meta_.dims()); if (meta_.type() == sendrecv::LOD_TENSOR) { PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!"); diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index 4c7fcbbdfb..a4324f67bb 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -85,6 +85,7 @@ class VariableResponse { inline framework::Scope* GetMutableLocalScope() const { return local_scope_; } inline std::string Varname() const { return meta_.varname(); } inline std::string OutVarname() const { return meta_.out_varname(); } + inline std::string TableName() const { return meta_.table_name(); } // should call parse first. framework::Variable* GetVar() { diff --git a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc index ed4dced513..a3b5ff8d17 100644 --- a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc +++ b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc @@ -46,8 +46,8 @@ class CheckpointNotifyOp : public framework::OperatorBase { auto lookup_table_save_dir = string::Sprintf("%s/%s_%d", dir, lookup_table_name, i); rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir); - VLOG(30) << "checkpoint notify sending lookup table: " - << lookup_table_name << " and dir:" << dir << " to " << epmap[i]; + VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name + << " and dir:" << dir << " to " << epmap[i]; } PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc index 88a5e59ce7..8754856e14 100644 --- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc +++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc @@ -43,7 +43,7 @@ class FetchBarrierOp : public framework::OperatorBase { PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); for (auto& ep : eps) { - VLOG(30) << "fetch barrier, ep: " << ep; + VLOG(3) << "fetch barrier, ep: " << ep; rpc_client->AsyncSendFetchBarrier(ep); } PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); diff --git a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc index 56ea165ff8..ef574ccdf4 100644 --- a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc +++ 
b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc @@ -64,7 +64,7 @@ class GenNCCLIdOp : public framework::OperatorBase { distributed::RPCClient::GetInstance(0); for (auto& ep : endpoint_list) { - VLOG(30) << "sending nccl id to " << ep; + VLOG(3) << "sending nccl id to " << ep; client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME); } client->Wait(); @@ -72,7 +72,7 @@ class GenNCCLIdOp : public framework::OperatorBase { client->AsyncSendBatchBarrier(ep); } client->Wait(); - VLOG(30) << "sending completed..."; + VLOG(3) << "sending completed..."; } void GetIdByServer(framework::Scope* scope, @@ -99,11 +99,11 @@ class GenNCCLIdOp : public framework::OperatorBase { std::bind(&distributed::RPCServer::StartServer, rpc_service.get())); rpc_service->SetCond(distributed::kRequestSend); - VLOG(30) << "start getting nccl id from trainer 0..."; + VLOG(3) << "start getting nccl id from trainer 0..."; rpc_service->WaitBarrier(distributed::kRequestSend); - VLOG(30) << "got nccl id and stop server..."; + VLOG(3) << "got nccl id and stop server..."; rpc_service->ShutDown(); - VLOG(30) << "rpc server stopped"; + VLOG(3) << "rpc server stopped"; server_thread.join(); } }; diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 9f0c7db0e1..ab92ad4506 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -36,7 +36,7 @@ namespace operators { void RunServer(std::shared_ptr service) { service->StartServer(); - VLOG(40) << "RunServer thread end"; + VLOG(4) << "RunServer thread end"; } static void split(const std::string &str, char sep, std::vector *pieces) { @@ -66,8 +66,8 @@ static void ParallelExecuteBlocks( fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() { int run_block = idx; // thread local try { - VLOG(30) << "running server block: " << run_block - << "pointer: " << prepared[run_block].get(); + VLOG(3) << "running server block: " << run_block + << "pointer: " << prepared[run_block].get(); executor->RunPreparedContext(prepared[run_block].get(), scope); } catch (const std::exception &e) { LOG(FATAL) << "run sub program:" << idx << " error " << e.what(); @@ -108,7 +108,7 @@ void ListenAndServOp::RunSyncLoop( framework::Scope *recv_scope, platform::DeviceContext *dev_ctx, const std::vector &prefetch_block_id_list, const int checkpoint_point_block_id) const { - VLOG(20) << "RunSyncLoop"; + VLOG(2) << "RunSyncLoop"; size_t num_blocks = program->Size(); auto optimize_blocks = Attr>(kOptimizeBlocks); @@ -167,7 +167,7 @@ void ListenAndServOp::RunSyncLoop( } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); - VLOG(20) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; + VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); @@ -183,11 +183,11 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, for (auto &varname : sparse_vars_) { auto var = recv_scope->FindVar(varname); if (var == nullptr) { - VLOG(20) << "can not find var " << varname << " in received scope"; + VLOG(2) << "can not find var " << varname << " in received scope"; continue; } if (var->IsType()) { - VLOG(30) << "reset sparse var: " << varname; + VLOG(3) << "reset sparse var: " << varname; var->GetMutable()->mutable_rows()->clear(); } else { PADDLE_THROW("The type of sparse var should be 
SelectedRows"); @@ -197,7 +197,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, for (auto &varname : dense_vars_) { auto var = recv_scope->FindVar(varname); if (var == nullptr) { - VLOG(20) << "can not find var " << varname << " in received scope"; + VLOG(2) << "can not find var " << varname << " in received scope"; continue; } if (var->IsType()) { @@ -216,7 +216,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope) const { - VLOG(20) << "RunAsyncLoop"; + VLOG(2) << "RunAsyncLoop"; auto grad_to_block_id_str = Attr>("grad_to_block_id"); DoubleFindMap grad_to_block_id; @@ -225,7 +225,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, const std::string &grad_and_id) { std::vector pieces; split(grad_and_id, ':', &pieces); - VLOG(30) << "after split, key = " << pieces[0] << ", id=" << pieces[1]; + VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1]; PADDLE_ENFORCE_EQ(pieces.size(), 2); PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0); @@ -270,7 +270,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, while (true) { if (rpc_service_->IsExit()) { - VLOG(40) << "get exit!rpc_processor break!"; + VLOG(4) << "get exit!rpc_processor break!"; break; } @@ -332,9 +332,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, std::string endpoint = Attr("endpoint"); int checkpoint_block_id = Attr(kCheckpointBlockId); - VLOG(40) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in - << ", end_point:" << endpoint - << ", checkpoint_block_id: " << checkpoint_block_id; + VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in + << ", end_point:" << endpoint + << ", checkpoint_block_id: " << checkpoint_block_id; rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); @@ -383,8 +383,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, prefetch_var_name_to_block_id_str) { std::vector pieces; split(prefetch_var_name_and_id, ':', &pieces); - VLOG(30) << "after split, prefetch_var = " << pieces[0] - << ", id=" << pieces[1]; + VLOG(3) << "after split, prefetch_var = " << pieces[0] + << ", id=" << pieces[1]; PADDLE_ENFORCE_EQ(pieces.size(), 2); int block_id = std::stoi(pieces[1]); @@ -415,7 +415,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // start the server listening after all member initialized. 
server_thread_.reset(new std::thread(RunServer, rpc_service_)); - VLOG(30) << "wait server thread to become ready..."; + VLOG(3) << "wait server thread to become ready..."; rpc_service_->WaitServerReady(); // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers diff --git a/paddle/fluid/operators/distributed_ops/prefetch_op.cc b/paddle/fluid/operators/distributed_ops/prefetch_op.cc index faa67a28d8..86425aba8c 100644 --- a/paddle/fluid/operators/distributed_ops/prefetch_op.cc +++ b/paddle/fluid/operators/distributed_ops/prefetch_op.cc @@ -48,12 +48,12 @@ class PrefetchOp : public framework::OperatorBase { std::vector rets; for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { - VLOG(30) << "sending " << ins[i] << " to " << epmap[i] << " to get " - << outs[i] << " back"; + VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get " + << outs[i] << " back"; rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i])); } else { - VLOG(30) << "don't send no-initialied variable: " << ins[i]; + VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } for (size_t i = 0; i < rets.size(); i++) { diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index fbbd86502b..0399ff4100 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -47,7 +47,7 @@ class RecvOp : public framework::OperatorBase { std::vector rets; for (size_t i = 0; i < outs.size(); i++) { - VLOG(30) << "getting " << outs[i] << " from " << epmap[i]; + VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i])); } if (sync_mode) { diff --git a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc index 02ca107ca3..8ca2877d8a 100644 --- a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc @@ -42,12 +42,12 @@ class SendBarrierOp : public framework::OperatorBase { distributed::RPCClient::GetInstance( Attr("trainer_id")); - VLOG(30) << "SendBarrierOp sync"; + VLOG(3) << "SendBarrierOp sync"; // need to wait before sending send_barrier message PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); for (auto& ep : eps) { - VLOG(30) << "send barrier, ep: " << ep; + VLOG(3) << "send barrier, ep: " << ep; rpc_client->AsyncSendBatchBarrier(ep); } PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index be53a1a32b..58a3ca8272 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -50,10 +50,10 @@ class SendOp : public framework::OperatorBase { std::vector rets; for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { - VLOG(30) << "sending " << ins[i] << " to " << epmap[i]; + VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i])); } else { - VLOG(30) << "don't send no-initialied variable: " << ins[i]; + VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } if (sync_send) { diff --git a/paddle/fluid/operators/distributed_ops/send_recv_op_test.cc b/paddle/fluid/operators/distributed_ops/send_recv_op_test.cc index bf798a8251..a6e1805cdd 100644 --- 
a/paddle/fluid/operators/distributed_ops/send_recv_op_test.cc +++ b/paddle/fluid/operators/distributed_ops/send_recv_op_test.cc @@ -120,7 +120,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, void StartServerNet(bool is_sparse, std::atomic *initialized) { f::Scope scope; p::CPUPlace place; - VLOG(40) << "before init tensor"; + VLOG(4) << "before init tensor"; if (is_sparse) { InitSelectedRowsInScope(place, &scope); } else { @@ -146,7 +146,7 @@ void StartServerNet(bool is_sparse, std::atomic *initialized) { attrs.insert({"PrefetchBlock", prefetch_block}); attrs.insert({"grad_to_block_id", std::vector({""})}); attrs.insert({"sync_mode", true}); - VLOG(40) << "before init op"; + VLOG(4) << "before init op"; listen_and_serv_op = f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); *initialized = true; diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.h b/paddle/fluid/operators/distributed_ops/split_byref_op.h index 3b7ae6fc91..fedd7218dd 100644 --- a/paddle/fluid/operators/distributed_ops/split_byref_op.h +++ b/paddle/fluid/operators/distributed_ops/split_byref_op.h @@ -32,7 +32,7 @@ class SplitByrefOpKernel : public framework::OpKernel { for (size_t i = 0; i < outs.size(); ++i) { // NOTE: no need to call mutable_data here to allocate memory. auto* out = outs[i]; - VLOG(30) << "spliting by ref: " << row_offset << " " << out->dims()[0]; + VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0]; *out = in->Slice(row_offset, row_offset + out->dims()[0]); row_offset += out->dims()[0]; } diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.h b/paddle/fluid/operators/distributed_ops/split_ids_op.h index f5d6d85d7d..acc9b1e622 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.h @@ -44,7 +44,7 @@ class SplitIdsOpKernel : public framework::OpKernel { for (size_t i = 0; i < ids_tensors.size(); ++i) { batch_size += ids_tensors[i]->dims()[0]; } - VLOG(40) << "Get Total BatchSize is: " << batch_size; + VLOG(4) << "Get Total BatchSize is: " << batch_size; std::vector all_ids(batch_size); int offset = 0; diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index dd3474dd25..2ccc86c1dc 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -120,6 +120,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel { "Dimensions of Input(X) and Mask must be the same."); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); } }; diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 424d273c34..3e401d1c4f 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifndef _WIN32 #include +#endif #include #include // NOLINT diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc index 10290a4aef..c600d1e3d7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc @@ -19,36 +19,21 @@ limitations under the License. 
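The fusion_gru and fusion_lstm hunks further below replace long positional jitkernel calls such as ComputeH1(xx_data, hidden_out_data) and ComputeCtHt(xx, ct_1, ct, ht, wp, checked) with a small per-step struct plus an attribute struct, so every step variant shares one kernel signature. A simplified stand-in for that pattern, using made-up types and placeholder math rather than the real math::jitkernel headers:

#include <string>

// Simplified stand-ins for math::jitkernel::gru_t / gru_attr_t.
struct gru_attr_t {
  int d;                       // hidden size
  std::string act_gate, act_cand;
};
struct gru_t {
  const float* gates;          // gate buffer for this step
  const float* ht_1;           // previous hidden state (may be unused on the first step)
  float* ht;                   // output hidden state
};

// Callers fill only the fields a given step needs; the signature never grows.
void ComputeH1(gru_t* step, const gru_attr_t* attr) {
  for (int i = 0; i < attr->d; ++i) {
    step->ht[i] = step->gates[i];                   // placeholder math, not the real GRU update
  }
}

void ComputeHtPart1(gru_t* step, const gru_attr_t* attr) {
  for (int i = 0; i < attr->d; ++i) {
    step->ht[i] = step->gates[i] + step->ht_1[i];   // placeholder math
  }
}
// end of illustrative sketch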
*/ #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/operators/math/jit_kernel.h" -#include "xbyak.h" -#include "xbyak_util.h" +#include "xbyak/xbyak.h" +#include "xbyak/xbyak_util.h" namespace paddle { namespace operators { using framework::DataLayout; using mkldnn::memory; - -static mkldnn::memory::format StringToMKLDNNFormat(std::string& format) { - std::transform(format.begin(), format.end(), format.begin(), ::tolower); - - if (!format.compare("nchw")) { - return memory::format::nchw; - } else if (!format.compare("nchw16c")) { - return memory::format::nChw16c; - } else if (!format.compare("nchw8c")) { - return memory::format::nChw8c; - } else if (!format.compare("nhwc")) { - return memory::format::nhwc; - } else { - return memory::format::any; - } -} +using platform::StringToMKLDNNFormat; static void UpdateDataFormat(const framework::ExecutionContext& ctx, framework::Tensor* tensor, const char* attribute) { if (ctx.op().HasAttr(attribute)) { auto format_as_string = ctx.Attr(attribute); - auto format = StringToMKLDNNFormat(format_as_string); + auto format = StringToMKLDNNFormat(&format_as_string); if (format != memory::format::any) { tensor->set_format(format); } @@ -93,8 +78,8 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { auto y_dims_untrimmed = y->dims(); auto x_int_dims = paddle::framework::vectorize2int(x_dims); - UpdateDataFormat(ctx, (Tensor*)x, "x_data_format"); - UpdateDataFormat(ctx, (Tensor*)y, "y_data_format"); + UpdateDataFormat(ctx, const_cast(x), "x_data_format"); + UpdateDataFormat(ctx, const_cast(y), "y_data_format"); Xbyak::util::Cpu cpu; const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F); @@ -156,10 +141,10 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); if (!(is_x_nchw || is_x_nc)) - ReorderInput((Tensor*)x, ctx.GetPlace(), mkldnn_engine, + ReorderInput(const_cast(x), ctx.GetPlace(), mkldnn_engine, x->dims().size() == 4); if (!(is_y_nchw || is_y_nc)) - ReorderInput((Tensor*)y, ctx.GetPlace(), mkldnn_engine, + ReorderInput(const_cast(y), ctx.GetPlace(), mkldnn_engine, y->dims().size() == 4); } diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 7e34d1019c..25b7ae7c28 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -183,24 +183,27 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const auto& ker = math::jitkernel::KernelPool::Instance() \ - .template Get, \ - const std::string&, const std::string&>( \ - ctx.Attr("gate_activation"), \ - ctx.Attr("activation"), D); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ 
+ const math::jitkernel::gru_attr_t attr( \ + D, ctx.Attr("gate_activation"), \ + ctx.Attr("activation")); \ + math::jitkernel::gru_t one_step; \ + const auto& ker = \ + math::jitkernel::KernelPool::Instance() \ + .template Get, \ + const math::jitkernel::gru_attr_t&>(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { @@ -237,7 +240,9 @@ class FusionGRUKernel : public framework::OpKernel { if (h0_data) { prev_hidden_data = h0_data + bid * D; } else { - ker->ComputeH1(xx_data, hidden_out_data); + one_step.gates = xx_data; + one_step.ht = hidden_out_data; + ker->ComputeH1(&one_step, &attr); prev_hidden_data = hidden_out_data; tstart = 1; move_step(); @@ -247,12 +252,15 @@ class FusionGRUKernel : public framework::OpKernel { blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast(1), prev_hidden_data, D, wh_data, D2, static_cast(1), xx_data, D3); - ker->ComputeHtPart1(xx_data, prev_hidden_data, hidden_out_data); + one_step.gates = xx_data; + one_step.ht_1 = prev_hidden_data; + one_step.ht = hidden_out_data; + ker->ComputeHtPart1(&one_step, &attr); // gemm rt * Ws blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast(1), hidden_out_data, D, wh_state_data, D, static_cast(1), xx_data + D2, D3); - ker->ComputeHtPart2(xx_data, prev_hidden_data, hidden_out_data); + ker->ComputeHtPart2(&one_step, &attr); // save prev prev_hidden_data = hidden_out_data; move_step(); @@ -314,7 +322,9 @@ class FusionGRUKernel : public framework::OpKernel { T* cur_out_data = batched_out_data; // W: {W_update, W_reset; W_state} for (int i = 0; i < max_bs; ++i) { - ker->ComputeH1(cur_in_data, cur_out_data); + one_step.gates = cur_in_data; + one_step.ht = cur_out_data; + ker->ComputeH1(&one_step, &attr); // add offset cur_in_data += D3; cur_out_data += D; @@ -339,8 +349,11 @@ class FusionGRUKernel : public framework::OpKernel { T* cur_out_data = batched_out_data; T* cur_prev_hidden_data = prev_hidden_data; for (int i = 0; i < cur_bs; ++i) { - ker->ComputeHtPart1(cur_batched_data, cur_prev_hidden_data, - cur_out_data); + one_step.gates = cur_batched_data; + one_step.ht_1 = cur_prev_hidden_data; + one_step.ht = cur_out_data; + ker->ComputeHtPart1(&one_step, &attr); + cur_batched_data += D3; cur_prev_hidden_data += D; cur_out_data += D; @@ -354,8 +367,10 @@ class FusionGRUKernel : public framework::OpKernel { cur_prev_hidden_data = prev_hidden_data; for (int i = 0; i < cur_bs; ++i) { - ker->ComputeHtPart2(cur_batched_data, cur_prev_hidden_data, - cur_out_data); + one_step.gates = cur_batched_data; + one_step.ht_1 = cur_prev_hidden_data; + one_step.ht = cur_out_data; + ker->ComputeHtPart2(&one_step, &attr); cur_batched_data += D3; cur_prev_hidden_data += D; cur_out_data += D; diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 0959539068..8021a896ce 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -236,27 +236,31 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = 
ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const auto& ker = \ - math::jitkernel::KernelPool::Instance() \ - .template Get, const std::string&, \ - const std::string&, const std::string&>( \ - ctx.Attr("gate_activation"), \ - ctx.Attr("candidate_activation"), \ - ctx.Attr("cell_activation"), D, use_peepholes) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const math::jitkernel::lstm_attr_t attr( \ + D, ctx.Attr("gate_activation"), \ + ctx.Attr("candidate_activation"), \ + ctx.Attr("cell_activation"), use_peepholes); \ + math::jitkernel::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + const auto& ker = \ + math::jitkernel::KernelPool::Instance() \ + .template Get, \ + const math::jitkernel::lstm_attr_t&>(attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ @@ -299,7 +303,10 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_h_data = h0_data + bid * D; prev_c_data = c0_data + bid * D; } else { - ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data); + one_step.gates = xx_data; + one_step.ct = c_out_data; + one_step.ht = h_out_data; + ker->ComputeC1H1(&one_step, &attr); tstart = 1; // move one step prev_h_data = h_out_data; @@ -310,8 +317,12 @@ class FuisonLSTMKernel : public framework::OpKernel { } for (int step = tstart; step < seq_len; ++step) { GEMM_WH_ADDON(1, prev_h_data, xx_data); - ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data, - checked_cell_data); + + one_step.gates = xx_data; + one_step.ct_1 = prev_c_data; + one_step.ct = c_out_data; + one_step.ht = h_out_data; + ker->ComputeCtHt(&one_step, &attr); // move one step prev_h_data = h_out_data; prev_c_data = c_out_data; @@ -388,7 +399,11 @@ class FuisonLSTMKernel : public framework::OpKernel { T* cur_h_out_data = batched_h_out_data; T* cur_c_out_data = batched_c_out_data; for (int i = 0; i < max_bs; ++i) { - ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data); + one_step.gates = cur_in_data; + one_step.ct = cur_c_out_data; + one_step.ht = cur_h_out_data; + ker->ComputeC1H1(&one_step, &attr); + cur_in_data += D4; cur_c_out_data += D; cur_h_out_data += D; @@ -413,8 +428,12 @@ class FuisonLSTMKernel : public framework::OpKernel { T* cur_c_out_data = batched_c_out_data; T* cur_h_out_data = batched_h_out_data; for (int i = 0; i < cur_bs; ++i) { - ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, - cur_h_out_data, wp_data, checked_cell_data); + one_step.gates = cur_in_data; + one_step.ct_1 = cur_prev_c_data; + one_step.ct = cur_c_out_data; + one_step.ht = cur_h_out_data; + ker->ComputeCtHt(&one_step, &attr); + // move one batch cur_in_data += D4; cur_prev_c_data += D; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index dadd054b9a..972dcf5494 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ 
b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/hierarchical_sigmoid_op.h" +#include #include - namespace paddle { namespace operators { @@ -70,13 +70,14 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { const int64_t batch_size = ctx->GetInputDim("X")[0]; std::vector output_shape({batch_size, 1}); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->ShareLoD("X", /*->*/ "Out"); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); } }; @@ -86,27 +87,40 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(Tensor, required) The input tensor with shape [N, D], " + "(LoDTensor, required) The input tensor with shape [N, D], " "where N is the size of mini-batch, and D is the feature size."); AddInput("W", - "(Tensor, required), The parameters of hierarchical " + "(LoDTensor, required), The parameters of hierarchical " "sigmoid operator, each of them is a 2-D tensor, the shape is" - "[num_classes - 1, D]."); + "[K, D]. Which K is the num of non-leaf node in Path Tree"); AddInput("Label", - "(Tensor, required), The labels of training data. It's a" + "(LoDTensor, required), The labels of training data. It's a" "tensor with shape [N, 1]."); + AddInput("PTable", + "(LoDTensor, optional), The Path Table from root to current word" + "it should have shape like [N, L], L is the length of the Path") + .AsDispensable(); + AddInput( + "PathCode", + "(LoDTensor, optional), The Code on each Node of the Path from root " + "to current word" + "it should have shape like [N, L], L is the length of the Path") + .AsDispensable(); AddInput("Bias", - "(Tensor, optional), The bias is a tensor with shape" - "[1, num_classes - 1]."); - AddOutput("Out", - "(Tensor, required) The output of hierarchical sigmoid operator." - "The shape is [N, 1]."); + "(LoDTensor, optional), The bias is a tensor with shape or " + "[num_classes, 1]" + "[num_classes - 1, 1].") + .AsDispensable(); + AddOutput( + "Out", + "(LoDTensor, required) The output of hierarchical sigmoid operator." + "The shape is [N, 1]."); AddOutput("PreOut", - "(Tensor, required) A intermedia 2-D tensor with shape " + "(LoDTensor, required) A intermedia 2-D tensor with shape " "[batch_size, code_length], where code_length represents the " "maximum path length from root to leaf nodes.") .AsIntermediate(); - AddAttr("num_classes", "(int, required), The number of classes") + AddAttr("num_classes", "(int, optional), The number of classes") .SetDefault(2); AddComment(R"DOC( The hierarchical sigmoid operator organize the classes into a binary tree. @@ -115,6 +129,10 @@ belonging to the right branch. This idea is from "F. Morin, Y. Bengio (AISTATS 05): Hierarchical Probabilistic Neural Network Language Model." 
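In other words, each non-leaf node n_j on the path from the root to a sample's class contributes one binary logistic term: with path code d_j in {0, 1} and pre-activation s_j = w_{n_j} . x + b_{n_j} (clipped to [-40, 40] in this kernel), the per-sample loss is sum_j [log(1 + exp(s_j)) - d_j * s_j]. A scalar reference sketch follows, using plain arrays in place of the optional PTable / PathCode inputs added above; the names, the per-row layout, and the negative-id padding convention are assumptions for illustration.

// Illustrative scalar reference (not part of the patch) for the per-sample
// hierarchical sigmoid loss along one root-to-leaf path. `path` / `code`
// mimic one row of PTable / PathCode; layout and padding are assumptions.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

float HSigmoidLossSketch(const std::vector<float>& x,                // [D]
                         const std::vector<std::vector<float>>& w,   // [K][D]
                         const std::vector<float>& bias,             // [K]
                         const std::vector<int64_t>& path,           // node ids
                         const std::vector<int64_t>& code) {         // 0/1 bits
  float loss = 0.f;
  for (size_t j = 0; j < path.size(); ++j) {
    if (path[j] < 0) continue;  // assumed padding for short paths
    const size_t n = static_cast<size_t>(path[j]);
    float s = bias[n];
    for (size_t k = 0; k < x.size(); ++k) s += w[n][k] * x[k];
    s = std::max(-40.f, std::min(40.f, s));          // clip, as in the kernel
    loss += std::log1p(std::exp(s)) - static_cast<float>(code[j]) * s;
  }
  return loss;
}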
)DOC"); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update.") + .SetDefault(false); } }; @@ -124,16 +142,21 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) should not be null"); PADDLE_ENFORCE(ctx->HasInput("PreOut"), "Input(Preout) should not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")), - "Output(W@Grad should not be null.)"); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X"))); - if (ctx->HasOutput(framework::GradVarName("Bias"))) { - ctx->SetOutputDim(framework::GradVarName("Bias"), - ctx->GetInputDim("Bias")); + "Output(W@Grad should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad should not be null."); + if (!ctx->Attrs().Get("is_sparse")) { + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); + } + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); } - ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } @@ -141,11 +164,55 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); } }; +class HierarchicalSigmoidGradOpGradVarTypeInference + : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto w_grad_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto bias_grad_var_name_vec = + op_desc.Output(framework::GradVarName("Bias")); + std::string bias_grad_var_name; + bool hasBias = false; + if (bias_grad_var_name_vec.size()) { + hasBias = true; + bias_grad_var_name = + op_desc.Output(framework::GradVarName("Bias")).front(); + } + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") + << " is set to SelectedRows"; + block->Var(w_grad_var_name) + ->SetType(framework::proto::VarType::SELECTED_ROWS); + if (hasBias) { + VLOG(30) << "hierarchical_sigmoid_grad op " + << framework::GradVarName("Bias") << " is set to SelectedRows"; + block->Var(bias_grad_var_name) + ->SetType(framework::proto::VarType::SELECTED_ROWS); + } + } else { + VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + block->Var(w_grad_var_name) + ->SetType(framework::proto::VarType::LOD_TENSOR); + if (hasBias) { + VLOG(30) << "hierarchical_sigmoid_grad op " + << framework::GradVarName("Bias") << " is set to LoDTensor"; + block->Var(bias_grad_var_name) + ->SetType(framework::proto::VarType::LOD_TENSOR); + } + } + block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType()); + } +}; + } // namespace operators } // namespace paddle @@ -153,7 +220,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp, 
ops::HierarchicalSigmoidOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp); +REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp, + ops::HierarchicalSigmoidGradOpGradVarTypeInference); REGISTER_OP_CPU_KERNEL( hierarchical_sigmoid, ops::HierarchicalSigmoidOpKernel, diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 79980cda53..07ff8f947e 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -14,12 +14,16 @@ limitations under the License. */ #pragma once #include +#include #include +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/clip_op.h" +#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/fluid/platform/transform.h" + namespace paddle { namespace operators { @@ -28,20 +32,38 @@ template ; using platform::Transform; +static std::vector PathToRows(const framework::LoDTensor& path) { + std::set rows; + for (int64_t i = 0; i < path.numel(); ++i) { + int64_t row = path.data()[i]; + if (row < 0) { + continue; + } + rows.emplace(row); + } + return std::vector(rows.begin(), rows.end()); +} template class HierarchicalSigmoidOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* w = ctx.Input("W"); - auto* label = ctx.Input("Label"); - auto* bias = ctx.Input("Bias"); - auto* out = ctx.Output("Out"); - auto* pre_out = ctx.Output("PreOut"); + auto& in = detail::Ref(ctx.Input("X")); + auto& w = detail::Ref(ctx.Input("W")); + auto* path = ctx.Input("PTable"); + auto* code = ctx.Input("PathCode"); + auto& label = detail::Ref(ctx.Input("Label")); + auto* bias = ctx.Input("Bias"); + auto* out = ctx.Output("Out"); + auto* pre_out = ctx.Output("PreOut"); size_t num_classes = static_cast(ctx.Attr("num_classes")); - int64_t code_length = math::FindLastSet(num_classes - 1); - int64_t batch_size = in->dims()[0]; - framework::Tensor sum; + bool is_custom = false; + if (path) { + is_custom = true; + } + int64_t code_length = + path ? 
path->dims()[1] : math::FindLastSet(num_classes - 1); + int64_t batch_size = in.dims()[0]; + framework::LoDTensor sum; auto& dev_ctx = ctx.template device_context(); auto* pre_out_data = pre_out->mutable_data( framework::make_ddim({batch_size, code_length}), ctx.GetPlace()); @@ -52,7 +74,15 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { zero(dev_ctx, pre_out, static_cast(0.0)); auto& place = *ctx.template device_context().eigen_device(); math::RowwiseSum row_sum; - math::MatrixBitCodeFunctor bit_code(num_classes, label->data()); + + std::unique_ptr> bit_code; + if (!is_custom) { + bit_code.reset(new math::MatrixBitCodeFunctor(num_classes, + label.data())); + } else { + bit_code.reset(new math::MatrixBitCodeFunctor(*path, *code, + label.data())); + } std::vector sum_dims({batch_size, 1UL}); sum.mutable_data(framework::make_ddim(sum_dims), ctx.GetPlace()); @@ -60,15 +90,15 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); auto out_mat = framework::EigenVector::Flatten(*out); if (bias) { - bit_code.Add(pre_out, *bias); + bit_code->Add(*bias, pre_out); } - bit_code.Mul(pre_out, *w, *in); + bit_code->Mul(pre_out, w, in); // clip to [-40, 40] Transform trans; trans(ctx.template device_context(), pre_out_data, pre_out_data + pre_out->numel(), pre_out_data, ClipFunctor(static_cast(-40.0), static_cast(40.0))); - bit_code.Sum(*pre_out, out, static_cast(-1)); + bit_code->Sum(*pre_out, out, static_cast(-1)); // use softrelu to calculate cross entropy pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); row_sum(dev_ctx, *pre_out, &sum); @@ -84,50 +114,103 @@ template class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* w = ctx.Input("W"); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - auto* w_grad = ctx.Output(framework::GradVarName("W")); - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - auto* label = ctx.Input("Label"); - auto* pre_out = ctx.Input("PreOut"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - framework::Tensor pre_out_grad; - - pre_out_grad.mutable_data(pre_out->dims(), ctx.GetPlace()); - in_grad->mutable_data(ctx.GetPlace()); - w_grad->mutable_data(ctx.GetPlace()); + auto& in = detail::Ref(ctx.Input("X")); + auto& w = detail::Ref(ctx.Input("W")); + auto* path = ctx.Input("PTable"); + auto* code = ctx.Input("PathCode"); + auto* bias = ctx.Input("Bias"); + auto* in_grad = + ctx.Output(framework::GradVarName("X")); + bool is_sparse = ctx.Attr("is_sparse"); auto& dev_ctx = ctx.template device_context(); math::SetConstant zero; + auto& label = detail::Ref(ctx.Input("Label")); + auto& pre_out = detail::Ref(ctx.Input("PreOut")); + auto& out_grad = detail::Ref( + ctx.Input(framework::GradVarName("Out"))); + framework::LoDTensor pre_out_grad; + + pre_out_grad.mutable_data(pre_out.dims(), ctx.GetPlace()); + in_grad->mutable_data(ctx.GetPlace()); zero(dev_ctx, in_grad, static_cast(0.0)); - zero(dev_ctx, w_grad, static_cast(0.0)); size_t num_classes = static_cast(ctx.Attr("num_classes")); - math::MatrixBitCodeFunctor bit_code(num_classes, label->data()); + + bool is_custom = false; + if (path) { + is_custom = true; + } + + std::unique_ptr> bit_code; + if (!is_custom) { + bit_code.reset(new math::MatrixBitCodeFunctor(num_classes, + label.data())); + } else { + bit_code.reset(new math::MatrixBitCodeFunctor(*path, *code, + 
label.data())); + } auto& place = *ctx.template device_context().eigen_device(); - auto pre_out_mat = EigenMatrix::From(*pre_out); + auto pre_out_mat = EigenMatrix::From(pre_out); auto pre_out_grad_mat = EigenMatrix::From(pre_out_grad); - auto out_grad_mat = EigenMatrix::From(*out_grad); + auto out_grad_mat = EigenMatrix::From(out_grad); + Eigen::array bcast{1, static_cast(pre_out_grad.dims()[1])}; // softrelu derivative pre_out_grad_mat.device(place) = static_cast(1.0) - static_cast(1.0) / pre_out_mat.exp(); - bit_code.Sub(&pre_out_grad); // the gradient of clip(w * x + b) + bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) pre_out_grad_mat.device(place) = pre_out_grad_mat * out_grad_mat.broadcast(bcast); // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // be consistent with the clipping in forward. - if (bias_grad) { - bias_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code.AddGrad(pre_out_grad, bias_grad); + + if (!is_sparse) { + auto* bias_grad = + ctx.Output(framework::GradVarName("Bias")); + if (bias_grad) { + bias_grad->mutable_data(ctx.GetPlace()); + zero(dev_ctx, bias_grad, static_cast(0.0)); + bit_code->AddGrad(pre_out_grad, bias_grad); + } + auto* w_grad = + ctx.Output(framework::GradVarName("W")); + w_grad->mutable_data(ctx.GetPlace()); + zero(dev_ctx, w_grad, static_cast(0.0)); + bit_code->MulGradWeight(pre_out_grad, w_grad, in); + } else { + framework::Vector real_rows = PathToRows(*path); + auto* w_grad = + ctx.Output(framework::GradVarName("W")); + w_grad->set_rows(real_rows); + // Build a map of id -> row_index to speed up finding the index of one id + w_grad->SyncIndex(); + w_grad->set_height(w.dims()[0]); + auto* w_grad_value = w_grad->mutable_value(); + framework::DDim temp_dim(w.dims()); + set(temp_dim, 0, real_rows.size()); + + w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); + zero(dev_ctx, w_grad_value, static_cast(0.0)); + auto* bias_grad = + ctx.Output(framework::GradVarName("Bias")); + if (bias_grad) { + bias_grad->set_rows(real_rows); + // build ids -> rows index map + bias_grad->SyncIndex(); + bias_grad->set_height(bias->dims()[0]); + auto* bias_grad_value = bias_grad->mutable_value(); + std::vector dims = {static_cast(real_rows.size()), + bias->dims()[1]}; + bias_grad_value->mutable_data(framework::make_ddim(dims), + ctx.GetPlace()); + zero(dev_ctx, bias_grad_value, static_cast(0.0)); + bit_code->AddGrad(pre_out_grad, bias_grad); + } + bit_code->MulGradWeight(pre_out_grad, w_grad, in); } - bit_code.MulGradWeight(pre_out_grad, w_grad, *in); - bit_code.MulGradError(pre_out_grad, *w, in_grad); + bit_code->MulGradError(pre_out_grad, w, in_grad); } }; diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 8f979e05d3..4d25822259 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -76,11 +76,12 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("out_h", "output height of interpolate op."); AddAttr("out_w", "output width of interpolate op."); - AddAttr( - "interp_method", - "(string), interpolation method, can be \"bilinear\" for " - "bilinear interpolation and \"nearest\" for nearest " - "neighbor interpolation."); + AddAttr("interp_method", + "(string, default \"bilinear\"), interpolation " + "method, can be \"bilinear\" for " + "bilinear interpolation and \"nearest\" for nearest " + "neighbor interpolation.") + .SetDefault("bilinear"); AddComment(R"DOC( 
This operator samples input X to given output shape by using specified interpolation method, the interpolation methods can be \"nearest\" @@ -132,11 +133,19 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(interpolate, ops::InterpolateOp, ops::InterpolateOpMaker, +REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(interpolate_grad, ops::InterpolateOpGrad); -REGISTER_OP_CPU_KERNEL(interpolate, ops::InterpolateKernel, +REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad); +REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad); +REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel, + ops::InterpolateKernel, + ops::InterpolateKernel); +REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, ops::InterpolateGradKernel, + ops::InterpolateGradKernel); +REGISTER_OP_CPU_KERNEL(nearest_interp, ops::InterpolateKernel, ops::InterpolateKernel, ops::InterpolateKernel); -REGISTER_OP_CPU_KERNEL(interpolate_grad, ops::InterpolateGradKernel, +REGISTER_OP_CPU_KERNEL(nearest_interp_grad, ops::InterpolateGradKernel, ops::InterpolateGradKernel); diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 190afbdac4..99ac725f73 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -284,9 +284,15 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(interpolate, ops::InterpolateOpCUDAKernel, +REGISTER_OP_CUDA_KERNEL(bilinear_interp, ops::InterpolateOpCUDAKernel, ops::InterpolateOpCUDAKernel, ops::InterpolateOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(interpolate_grad, +REGISTER_OP_CUDA_KERNEL(bilinear_interp_grad, + ops::InterpolateGradOpCUDAKernel, + ops::InterpolateGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(nearest_interp, ops::InterpolateOpCUDAKernel, + ops::InterpolateOpCUDAKernel, + ops::InterpolateOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(nearest_interp_grad, ops::InterpolateGradOpCUDAKernel, ops::InterpolateGradOpCUDAKernel); diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc index 59ef9cb626..166952fe23 100644 --- a/paddle/fluid/operators/lod_rank_table_op.cc +++ b/paddle/fluid/operators/lod_rank_table_op.cc @@ -30,9 +30,9 @@ class LoDRankTableOp : public framework::OperatorBase { auto x = scope.FindVar(Input("X"))->Get(); auto *out = scope.FindVar(Output("Out"))->GetMutable(); - VLOG(100) << "Level = " << static_cast(Attr("level")); + VLOG(10) << "Level = " << static_cast(Attr("level")); out->Reset(x.lod(), static_cast(Attr("level"))); - VLOG(100) << Input("X") << "'s lod information is " << *out; + VLOG(10) << Input("X") << "'s lod information is " << *out; } }; diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index e72337a3e6..145d2db118 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -192,6 +192,10 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase { // The first dim of each LoDTensor in Output can only be set at run-time.; // We still have to Resize each LoDTensor in Output. 
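For reference, the two resampling rules behind the bilinear_interp / nearest_interp registrations above, written out for a single output pixel. This is a generic scalar sketch rather than the kernel's actual implementation; corner-alignment details are omitted and all names are illustrative.

// Generic scalar sketch (not part of the patch) of nearest and bilinear
// resampling for one output pixel (oy, ox) of a H x W single-channel input.
#include <algorithm>
#include <vector>

float NearestSketch(const std::vector<float>& in, int H, int W,
                    float ratio_h, float ratio_w, int oy, int ox) {
  int y = std::min(static_cast<int>(ratio_h * oy + 0.5f), H - 1);
  int x = std::min(static_cast<int>(ratio_w * ox + 0.5f), W - 1);
  return in[y * W + x];
}

float BilinearSketch(const std::vector<float>& in, int H, int W,
                     float ratio_h, float ratio_w, int oy, int ox) {
  float fy = ratio_h * oy, fx = ratio_w * ox;
  int y0 = static_cast<int>(fy), x0 = static_cast<int>(fx);
  int y1 = std::min(y0 + 1, H - 1), x1 = std::min(x0 + 1, W - 1);
  float ly = fy - y0, lx = fx - x0;
  return in[y0 * W + x0] * (1 - ly) * (1 - lx) +
         in[y0 * W + x1] * (1 - ly) * lx +
         in[y1 * W + x0] * ly * (1 - lx) +
         in[y1 * W + x1] * ly * lx;
}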
context->SetOutputDim("Out", x_dim); + // The lod level should be passed to out in compile time. + if (!context->IsRuntime()) { + context->DecreaseLoDLevel("X", /*->*/ "Out"); + } } }; diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 1878dfe8a8..0029932bc0 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -87,6 +87,25 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "If the grad op reuse the input's variable.") .SetDefault(false); + + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); + AddComment(R"DOC( Lookup Table Operator. @@ -134,13 +153,13 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { auto attr = op_desc.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { - VLOG(30) << "lookup_table_grad op " << framework::GradVarName("W") - << " is set to SelectedRows"; + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to SelectedRows"; block->Var(out_var_name) ->SetType(framework::proto::VarType::SELECTED_ROWS); } else { - VLOG(30) << "lookup_table_grad op " << framework::GradVarName("W") - << " is set to LoDTensor"; + VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index abd5dce8f7..6a0d6bad51 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -31,8 +31,8 @@ __global__ void LookupTable(T *output, const T *table, const int64_t *ids, while (idy < K) { int64_t id = ids[idy]; - PADDLE_ASSERT(id >= 0); - PADDLE_ASSERT(id < N); + PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id); + PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id); T *out = output + idy * D; const T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { @@ -57,9 +57,9 @@ __global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids, int idy = blockIdx.x + threadIdx.y * GridDimX; while (idy < K) { - int id = ids[idy]; - PADDLE_ASSERT(id >= 0); - PADDLE_ASSERT(id < N); + int64_t id = ids[idy]; + PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id); + PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id); const T *out = output + idy * D; T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { @@ -78,27 +78,47 @@ class LookupTableCUDAKernel : public framework::OpKernel { auto *output_t = context.Output("Out"); int64_t padding_idx = context.Attr("padding_idx"); - size_t N = table_t->dims()[0]; - size_t D = table_t->dims()[1]; - size_t K = ids_t->numel(); - - auto *ids = ids_t->data(); - auto *table = table_t->data(); - auto *output = output_t->mutable_data(context.GetPlace()); 
- - dim3 threads(128, 8); - dim3 grids(8, 1); - - if (padding_idx == -1) - LookupTable< - T, 128, 8, 8, - false><<>>( - output, table, ids, N, K, D, padding_idx); - else - LookupTable< - T, 128, 8, 8, - true><<>>( - output, table, ids, N, K, D, padding_idx); + auto id_name = context.Inputs("Ids").front(); + auto out_name = context.Outputs("Out").front(); + + // for remote prefetch + auto epmap = context.Attr>("epmap"); + auto height_sections = context.Attr>("height_sections"); + auto table_names = context.Attr>("table_names"); + + if (!epmap.empty()) { +// if epmap is not empty, then the parameter will be fetched from remote +// parameter +// server +#ifdef PADDLE_WITH_DISTRIBUTE + operators::distributed::prefetch(id_name, out_name, table_names, epmap, + height_sections, context); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + } else { + size_t N = table_t->dims()[0]; + size_t D = table_t->dims()[1]; + size_t K = ids_t->numel(); + + auto *ids = ids_t->data(); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + dim3 threads(128, 8); + dim3 grids(8, 1); + + if (padding_idx == -1) + LookupTable<<< + grids, threads, 0, context.cuda_device_context().stream()>>>( + output, table, ids, N, K, D, padding_idx); + else + LookupTable<<< + grids, threads, 0, context.cuda_device_context().stream()>>>( + output, table, ids, N, K, D, padding_idx); + } } }; @@ -109,6 +129,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto &dev_ctx = context.template device_context(); bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index e504c4f0cd..3a73a7637c 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -23,6 +23,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/math/blas.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -41,44 +45,66 @@ class LookupTableKernel : public framework::OpKernel { auto *output_t = context.Output("Out"); // float tensor auto *table_var = context.InputVar("W"); - int64_t padding_idx = context.Attr("padding_idx"); - int64_t *ids = const_cast(ids_t->data()); - int64_t ids_numel = ids_t->numel(); - - if (table_var->IsType()) { - auto *table_t = context.Input("W"); - int64_t row_number = table_t->dims()[0]; - int64_t row_width = table_t->dims()[1]; - - auto *table = table_t->data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_LT(ids[i], row_number); - PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); - memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(T)); + auto id_name = context.Inputs("Ids").front(); + auto out_name = context.Outputs("Out").front(); + + // for remote prefetch + auto epmap = context.Attr>("epmap"); + auto height_sections = context.Attr>("height_sections"); + auto table_names = context.Attr>("table_names"); + + if (!epmap.empty()) { +// if epmap is not empty, then the parameter will be fetched from remote +// parameter +// server +#ifdef PADDLE_WITH_DISTRIBUTE + operators::distributed::prefetch(id_name, out_name, table_names, epmap, + height_sections, context); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + } else { + int64_t padding_idx = context.Attr("padding_idx"); + int64_t *ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->numel(); + + if (table_var->IsType()) { + auto *table_t = context.Input("W"); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], row_number); + PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); + memcpy(output + i * row_width, table + ids[i] * row_width, + row_width * sizeof(T)); + } } - } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); - int64_t row_width = table_t.value().dims()[1]; - const auto *table = table_t.value().data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - auto blas = math::GetBlas(context); - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_GE(ids[i], 0); - auto id_index = table_t.Index(ids[i]); - PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); - blas.VCOPY(row_width, table + id_index * row_width, - output + i * row_width); + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); + int64_t row_width = table_t.value().dims()[1]; + const auto *table = table_t.value().data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i < ids_numel; ++i) 
{ + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_GE(ids[i], 0); + auto id_index = table_t.Index(ids[i]); + PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); + blas.VCOPY(row_width, table + id_index * row_width, + output + i * row_width); + } } } } diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 6734df1530..9f3a81f22c 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -168,6 +168,9 @@ class Blas { template void SCAL(int n, const T a, T* x) const; + template + T ASUM(int n, T* x, int inc) const; + template void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, const T* A, const T* B, T beta, T* C, @@ -269,6 +272,11 @@ class BlasT : private Blas { Base()->template SCAL(args...); } + template + T ASUM(ARGS... args) const { + return Base()->template ASUM(args...); + } + template void BatchedGEMM(ARGS... args) const { Base()->template BatchedGEMM(args...); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 93bf7c7c88..c84087bb1e 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -84,6 +84,11 @@ struct CBlas { platform::dynload::cblas_sscal(args...); } + template + static float ASUM(ARGS... args) { + return platform::dynload::cblas_sasum(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_sgemm_batch(args...); @@ -174,6 +179,11 @@ struct CBlas { platform::dynload::cblas_dscal(args...); } + template + static double ASUM(ARGS... args) { + return platform::dynload::cblas_dasum(args...); + } + template static void GEMM_BATCH(ARGS... args) { platform::dynload::cblas_dgemm_batch(args...); @@ -268,6 +278,7 @@ struct CBlas { static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; + static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) { PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); @@ -476,6 +487,21 @@ void Blas::SCAL(int n, const T a, T *x) const { #endif } +template <> +template +T Blas::ASUM(int n, T *x, int inc) const { + auto sum = static_cast(0.0); +#ifdef PADDLE_WITH_MKLML + sum = CBlas::ASUM(n, x, inc); +#else + // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum + for (int c = 0; c < n; ++c) { + sum += x[c]; + } +#endif + return sum; +} + template <> template void Blas::GEMV(bool trans_a, int M, int N, T alpha, diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index 18a586f8dd..c37fa291a2 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include #include @@ -22,6 +21,7 @@ limitations under the License. 
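A usage sketch for the new ASUM entry point; GetBlas on a CPU device context is the usual access pattern. Note that, as written in the hunk above, the non-MKLML fallback accumulates the raw signed values rather than absolute values.

// Illustrative sketch (not part of the patch): calling the new Blas ASUM
// wrapper on CPU.
#include <vector>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h"

float AsumSketch(const paddle::platform::CPUDeviceContext& ctx,
                 std::vector<float>* x) {
  auto blas = paddle::operators::math::GetBlas<
      paddle::platform::CPUDeviceContext, float>(ctx);
  // With MKLML this maps to cblas_sasum; otherwise the fallback loop above.
  return blas.ASUM(static_cast<int>(x->size()), x->data(), 1);
}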
*/ #include "gtest/gtest.h" #include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/port.h" inline double GetCurrentUS() { struct timeval time; @@ -96,8 +96,8 @@ void TestAndBench(const int n, std::function tgt, } auto et = GetCurrentUS(); - VLOG(30) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat - << " us, tgt takes: " << (mt - st) / repeat; + VLOG(3) << "Vec size " << n << ": refer takes: " << (et - mt) / repeat + << " us, tgt takes: " << (mt - st) / repeat; for (int i = 0; i < n; ++i) { EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3); } diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index ae2c90b33a..521cd7801a 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include -#include #include #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" +#include "paddle/fluid/platform/port.h" template void testIm2col() { diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index e484e9a3c7..52cbdf685d 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/jit_code.h" +#include // offsetof #include "paddle/fluid/operators/math/jit_kernel.h" // TODO(TJ): remove me namespace paddle { @@ -139,32 +140,10 @@ bool VActJitCode::init(int d, operand_type type) { } void VActJitCode::generate() { - xmm_t xmm_zero = xmm_t(2); - ymm_t ymm_zero = ymm_t(2); - if (type_ == operand_type::relu) { - vxorps(ymm_zero, ymm_zero, ymm_zero); - } int offset = 0; for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { vmovups(ymm_src, ptr[param1 + offset]); - switch (type_) { - case operand_type::relu: - relu_jmm(ymm_dst, ymm_src, ymm_zero); - break; - case operand_type::exp: - exp_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); - break; - case operand_type::sigmoid: - sigmoid_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); - break; - case operand_type::tanh: - tanh_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); - break; - case operand_type::identity: - break; - default: - break; - } + act(ymm_dst, ymm_src, type_); vmovups(ptr[param2 + offset], ymm_dst); offset += sizeof(float) * YMM_FLOAT_BLOCK; } @@ -181,22 +160,7 @@ void VActJitCode::generate() { block = 1; vmovss(xmm_src, ptr[param1 + offset]); } - switch (type_) { - case operand_type::relu: - relu_jmm(xmm_dst, xmm_src, xmm_zero); - break; - case operand_type::exp: - exp_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - case operand_type::sigmoid: - sigmoid_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - case operand_type::tanh: - tanh_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - default: - break; - } + act(xmm_dst, xmm_src, type_); if (rest >= 4) { vmovups(ptr[param2 + offset], xmm_dst); } else if (rest >= 2) { @@ -210,6 +174,158 @@ void VActJitCode::generate() { ret(); } +bool LSTMJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; } + +void LSTMJitCode::generate() { + if (use_peephole_) { + preCode(); + } + reg64_t reg_ptr_gates = rax; + reg64_t reg_ptr_ct_1 = r9; + reg64_t reg_ptr_ct = r10; + reg64_t reg_ptr_ht = r11; + reg64_t reg_ptr_wp = r12; + mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); + mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); + mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); + 
mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); + if (use_peephole_) { + mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]); + } + + int offset = 0; + int d = num_ * sizeof(float); + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + /* gates: W_ch, W_ih, W_fh, W_oh */ + ymm_t ymm_c = ymm_t(0); + ymm_t ymm_i = ymm_t(1); + ymm_t ymm_f = ymm_t(2); + ymm_t ymm_o = ymm_t(3); + ymm_t ymm_ct_1 = ymm_t(4); + ymm_t ymm_wp0 = ymm_t(5); + ymm_t ymm_wp1 = ymm_t(6); + ymm_t ymm_wp2 = ymm_t(7); + vmovups(ymm_c, ptr[reg_ptr_gates + offset]); + vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]); + vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]); + vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]); + if (!compute_c1h1_) { + vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]); + } + if (use_peephole_) { + vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]); + vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]); + vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]); + } + /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */ + // act_cand(c) + act(ymm_c, ymm_c, act_cand_); + // act_gate(i) or act_gate(ct_1 * wp0 + i) + if (!compute_c1h1_ && use_peephole_) { + vmulps(ymm_wp0, ymm_ct_1, ymm_wp0); + vaddps(ymm_i, ymm_i, ymm_wp0); + } + act(ymm_i, ymm_i, act_gate_); + vmulps(ymm_c, ymm_c, ymm_i); + if (!compute_c1h1_) { + // act_gate(f) or act_gate(ct_1 * wp1 + f) + if (use_peephole_) { + vmulps(ymm_wp1, ymm_ct_1, ymm_wp1); + vaddps(ymm_f, ymm_f, ymm_wp1); + } + act(ymm_f, ymm_f, act_gate_); + // ct + vmulps(ymm_f, ymm_f, ymm_ct_1); + vaddps(ymm_f, ymm_f, ymm_c); + } + /* H_t = act_cell(C_t) * act_gate(o) */ + // act_cell(C_t) + ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f; + ymm_t ymm_tmp = ymm_i; + act(ymm_tmp, ymm_ct, act_cell_); + // act_gate(o) or act_gate(ct * wp2 + o) + if (use_peephole_) { + vmulps(ymm_wp2, ymm_ct, ymm_wp2); + vaddps(ymm_o, ymm_o, ymm_wp2); + } + act(ymm_o, ymm_o, act_gate_); + // ht + vmulps(ymm_o, ymm_o, ymm_tmp); + // save ct and ht + vmovups(ptr[reg_ptr_ct + offset], ymm_ct); + vmovups(ptr[reg_ptr_ht + offset], ymm_o); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + + if (use_peephole_) { + postCode(); + } else { + ret(); + } +} + +bool GRUJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; } + +void GRUJitCode::generate() { + reg64_t reg_ptr_gates = rax; + reg64_t reg_ptr_ht_1 = r9; + reg64_t reg_ptr_ht = r10; + mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]); + mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]); + ymm_t ymm_one = ymm_t(0); + + if (id_ == 2) { + reg64_t reg_ptr_tmp = r11; + mov(reg_ptr_tmp, reinterpret_cast(exp_float_consts)); + vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]); + } + int offset = 0; + int d = num_ * sizeof(float); + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + ymm_t ymm_u = ymm_t(1); + ymm_t ymm_r = ymm_t(2); + ymm_t ymm_s = ymm_t(3); + ymm_t ymm_ht_1 = ymm_t(4); + // W: {W_update, W_reset; W_state} + if (id_ == 0 || id_ == 2) { + vmovups(ymm_u, ptr[reg_ptr_gates + offset]); + vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]); + } + if (id_ == 1) { + vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]); + } + if (id_ == 1 || id_ == 2) { + vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]); + } + + if (id_ == 0) { + // ht = act_gate(u) * act_cand(s) + act(ymm_u, ymm_u, act_gate_); + act(ymm_s, ymm_s, act_cand_); + vmulps(ymm_s, ymm_s, ymm_u); + vmovups(ptr[reg_ptr_ht + offset], ymm_s); + } else if (id_ == 1) { + // ht = act_gate(r) * ht_1 + act(ymm_r, ymm_r, act_gate_); + 
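For reference, a scalar version of the per-element LSTM step that LSTMJitCode::generate() vectorizes above (non-peephole case); the gates buffer is laid out as {c, i, f, o} blocks of size d, and act_gate / act_cand / act_cell stand for the configured activations. Function and parameter names are illustrative.

// Illustrative scalar reference (not part of the patch) for one LSTM step:
// C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f)
// H_t = act_cell(C_t) * act_gate(o)
#include <cstddef>
#include <functional>

void LstmStepScalarSketch(const float* gates, const float* ct_1, float* ct,
                          float* ht, std::size_t d,
                          const std::function<float(float)>& act_gate,
                          const std::function<float(float)>& act_cand,
                          const std::function<float(float)>& act_cell) {
  for (std::size_t k = 0; k < d; ++k) {
    const float c = act_cand(gates[k]);          // candidate
    const float i = act_gate(gates[k + d]);      // input gate
    const float f = act_gate(gates[k + 2 * d]);  // forget gate
    const float o = act_gate(gates[k + 3 * d]);  // output gate
    ct[k] = c * i + ct_1[k] * f;                 // C_t
    ht[k] = act_cell(ct[k]) * o;                 // H_t
  }
}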
vmulps(ymm_r, ymm_r, ymm_ht_1); + vmovups(ptr[reg_ptr_ht + offset], ymm_r); + } else if (id_ == 2) { + // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 + ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx()); + act(ymm_u, ymm_u, act_gate_); + act(ymm_s, ymm_s, act_cand_); + vmulps(ymm_s, ymm_s, ymm_u); + vsubps(ymm_u, ymm_one_inner, ymm_u); + vmulps(ymm_u, ymm_ht_1, ymm_u); + vaddps(ymm_u, ymm_s, ymm_u); + vmovups(ptr[reg_ptr_ht + offset], ymm_u); + } + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + + ret(); +} } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 64ef55de7c..a921462129 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/jit_gen.h" +#include "paddle/fluid/operators/math/jit_kernel_impl.h" #include "paddle/fluid/platform/cpu_info.h" namespace paddle { @@ -46,14 +47,6 @@ extern const float exp_float_consts[]; extern const int exp_int_0x7f[]; extern int g_tmp_mem[]; -// TODO(TJ): move these to some proper place -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 -#define XMM_FLOAT_BLOCK 4 -#define YMM_FLOAT_BLOCK 8 -#define ZMM_FLOAT_BLOCK 16 - #define ALIGN32 __attribute__((aligned(32))) #define EXP_HIG 88.3762626647949f #define EXP_LOW -88.3762626647949f @@ -176,31 +169,34 @@ class VActJitCode : public JitCode { protected: // compute relu with ymm, xmm template - void relu_jmm(JMM& dst, JMM& src, JMM& zero) { // NOLINT + void relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) { // NOLINT + JMM zero = JMM(zero_idx); + vxorps(zero, zero, zero); vmaxps(dst, src, zero); } // compute exp with ymm, xmm template - void exp_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT - int mask_idx = 4, int tmp_idx = 5) { - using namespace platform::jit; // NOLINT - assert(src.getIdx() != dst.getIdx()); // TODO(TJ): use enfore + void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT + int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) { + using namespace platform::jit; // NOLINT // check all idx can not equal + JMM jmm_src = JMM(src_idx); JMM jmm_fx = JMM(fx_idx); JMM jmm_fy = JMM(fy_idx); JMM jmm_mask = JMM(mask_idx); JMM jmm_tmp = JMM(tmp_idx); reg64_t reg_ptr_global = rax; push(reg_ptr_global); + vmovaps(jmm_src, src); mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); - vminps(src, src, jmm_tmp); + vminps(jmm_src, jmm_src, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); - vmaxps(src, src, jmm_tmp); + vmaxps(jmm_src, jmm_src, jmm_tmp); // express exp(x) as exp(g + n*log(2)) vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); - vmulps(jmm_fx, src, jmm_tmp); + vmulps(jmm_fx, jmm_src, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); vaddps(jmm_fx, jmm_fx, jmm_tmp); vroundps(jmm_fy, jmm_fx, 0x01); @@ -214,21 +210,21 @@ class VActJitCode : public JitCode { vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); JMM ymm_z = JMM(jmm_mask.getIdx()); vmulps(ymm_z, jmm_fx, jmm_tmp); - vsubps(src, src, jmm_fy); - vsubps(src, src, ymm_z); - vmulps(ymm_z, src, src); + vsubps(jmm_src, jmm_src, jmm_fy); + vsubps(jmm_src, jmm_src, ymm_z); + vmulps(ymm_z, jmm_src, jmm_src); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); - vmulps(dst, src, jmm_tmp); + 
vmulps(dst, jmm_src, jmm_tmp); for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; i += (YMM_FLOAT_BLOCK * sizeof(float))) { vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4 vaddps(dst, dst, jmm_tmp); - vmulps(dst, dst, src); + vmulps(dst, dst, jmm_src); } vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); vaddps(dst, dst, jmm_tmp); vmulps(dst, dst, ymm_z); - vaddps(dst, dst, src); + vaddps(dst, dst, jmm_src); vmovaps(jmm_tmp, ptr[reg_ptr_global]); vaddps(dst, dst, jmm_tmp); // build 2^n @@ -265,20 +261,23 @@ class VActJitCode : public JitCode { // compute sigmoid with ymm, xmm template - void sigmoid_jmm(JMM& dst, JMM& src, int fx_idx = 2, // NOLINT - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5) { + void sigmoid_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT + int fx_idx = 12, int fy_idx = 13, int mask_idx = 14, + int tmp_idx = 15) { // y = 1 / (1 + e^-x) JMM jmm_tmp = JMM(tmp_idx); + JMM jmm_src = JMM(src_idx); reg64_t reg_ptr_global = rax; push(reg_ptr_global); + vmovaps(jmm_src, src); mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); - vminps(src, src, jmm_tmp); + vminps(jmm_src, jmm_src, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); - vmaxps(src, src, jmm_tmp); + vmaxps(jmm_src, jmm_src, jmm_tmp); vxorps(jmm_tmp, jmm_tmp, jmm_tmp); - vsubps(src, jmm_tmp, src); - exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vsubps(jmm_src, jmm_tmp, jmm_src); + exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(dst, dst, jmm_tmp); vdivps(dst, jmm_tmp, dst); @@ -287,19 +286,22 @@ class VActJitCode : public JitCode { // compute tanh with ymm, xmm template - void tanh_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT - int mask_idx = 4, int tmp_idx = 5) { + void tanh_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT + int fx_idx = 12, int fy_idx = 13, int mask_idx = 14, + int tmp_idx = 15) { // y = 2 / (1 + e^(-2x)) - 1 + JMM jmm_src = JMM(src_idx); JMM jmm_tmp = JMM(tmp_idx); JMM jmm_zero = JMM(mask_idx); reg64_t reg_ptr_global = rax; push(reg_ptr_global); + vmovaps(jmm_src, src); mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); vxorps(jmm_zero, jmm_zero, jmm_zero); vsubps(jmm_tmp, jmm_zero, jmm_tmp); - vmulps(src, src, jmm_tmp); - exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vmulps(jmm_src, jmm_src, jmm_tmp); + exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(dst, dst, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); @@ -309,6 +311,30 @@ class VActJitCode : public JitCode { pop(reg_ptr_global); } + template + void act(JMM& dst, JMM& src, operand_type type) { // NOLINT + // use 11~15 + switch (type) { + case operand_type::relu: + relu_jmm(dst, src, 15); + break; + case operand_type::exp: + exp_jmm(dst, src, 11, 12, 13, 14, 15); + break; + case operand_type::sigmoid: + sigmoid_jmm(dst, src, 11, 12, 13, 14, 15); + break; + case operand_type::tanh: + tanh_jmm(dst, src, 11, 12, 13, 14, 15); + break; + case operand_type::identity: + break; + default: + // throw error + break; + } + } + protected: int num_; operand_type type_; @@ -322,6 +348,148 @@ class VActJitCode : public JitCode { ymm_t ymm_dst = ymm_t(1); }; +class LSTMJitCode : public VActJitCode { + public: + const char* name() const override { + std::string base = 
"LSTMJitCode"; + if (use_peephole_) { + base += "_Peephole"; + } + if (compute_c1h1_) { + base += "_C1H1"; + } + auto AddTypeStr = [&](operand_type type) { + switch (type) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + }; + AddTypeStr(act_gate_); + AddTypeStr(act_cand_); + AddTypeStr(act_cell_); + return base.c_str(); + } + + explicit LSTMJitCode(bool compute_c1h1, const lstm_attr_t& attr, + size_t code_size = 256 * 1024, void* code_ptr = nullptr) + : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size, + code_ptr), + compute_c1h1_(compute_c1h1) { + auto typeExchange = [](const std::string& type) -> gen::operand_type { + if (type == "sigmoid") { + return operand_type::sigmoid; + } else if (type == "relu") { + return operand_type::relu; + } else if (type == "tanh") { + return operand_type::tanh; + } else if (type == "identity" || type == "") { + return operand_type::identity; + } // else throw error + return operand_type::identity; + }; + num_ = attr.d; + use_peephole_ = attr.use_peephole; + act_gate_ = typeExchange(attr.act_gate); + act_cand_ = typeExchange(attr.act_cand); + act_cell_ = typeExchange(attr.act_cell); + } + static bool init(int d); + void generate() override; + + protected: + int num_; + bool compute_c1h1_; + bool use_peephole_; + operand_type act_gate_; + operand_type act_cand_; + operand_type act_cell_; + reg64_t param1{abi_param1}; +}; + +class GRUJitCode : public VActJitCode { + public: + const char* name() const override { + std::string base = "GRUJitCode"; + if (id_ == 0) { + base += "_H1"; + } else if (id_ == 1) { + base += "_HtPart1"; + } else if (id_ == 2) { + base += "_HtPart2"; + } + auto AddTypeStr = [&](operand_type type) { + switch (type) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + }; + AddTypeStr(act_gate_); + AddTypeStr(act_cand_); + return base.c_str(); + } + + explicit GRUJitCode(int id, const gru_attr_t& attr, + size_t code_size = 256 * 1024, void* code_ptr = nullptr) + : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size, + code_ptr), + id_(id) { + auto typeExchange = [](const std::string& type) -> gen::operand_type { + if (type == "sigmoid") { + return operand_type::sigmoid; + } else if (type == "relu") { + return operand_type::relu; + } else if (type == "tanh") { + return operand_type::tanh; + } else if (type == "identity" || type == "") { + return operand_type::identity; + } // else throw error + return operand_type::identity; + }; + num_ = attr.d; + act_gate_ = typeExchange(attr.act_gate); + act_cand_ = typeExchange(attr.act_cand); + } + static bool init(int d); + void generate() override; + + protected: + int id_; + int num_; + operand_type act_gate_; + operand_type act_cand_; + reg64_t param1{abi_param1}; +}; + #ifdef PADDLE_WITH_MKLDNN struct EltwiseMulnChw16cNC : public Xbyak::CodeGenerator { explicit EltwiseMulnChw16cNC(size_t code_size = 256 * 1024) diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 82d808f415..b78b92b4f9 100644 
--- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -17,6 +17,7 @@ limitations under the License. */ #include // for shared_ptr #include #include +#include "paddle/fluid/operators/math/jit_kernel_impl.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/macros.h" @@ -26,14 +27,7 @@ namespace operators { namespace math { namespace jitkernel { -// TODO(TJ): move these to some proper place -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 -#define XMM_FLOAT_BLOCK 4 -#define YMM_FLOAT_BLOCK 8 -#define ZMM_FLOAT_BLOCK 16 - +// TODO(TJ): remove me typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block; class Kernel { @@ -128,24 +122,18 @@ class VTanhKernel : public VActKernel {}; template class LSTMKernel : public Kernel { public: - virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht, - /* below only used in peephole*/ - const T *wp_data = nullptr, - T *checked = nullptr) const = 0; - // compute c1 and h1 without c0 or h0 - virtual void ComputeC1H1(T *gates, T *ct, T *ht, - /* below only used in peephole*/ - const T *wp_data = nullptr) const = 0; + void (*ComputeC1H1)(lstm_t *, const lstm_attr_t *); + void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *); }; template class GRUKernel : public Kernel { public: // compute h1 without h0 - virtual void ComputeH1(T *gates, T *ht) const = 0; - virtual void ComputeHtPart1(T *gates, const T *ht_1, T *ht) const = 0; - virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0; + void (*ComputeH1)(gru_t *, const gru_attr_t *); + void (*ComputeHtPart1)(gru_t *, const gru_attr_t *); + void (*ComputeHtPart2)(gru_t *, const gru_attr_t *); }; template diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index a143b51439..a0f93fd8e7 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_XBYAK @@ -31,49 +32,6 @@ namespace math { namespace jitkernel { namespace jit = platform::jit; -template -void VMulRefer(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - -template -void VAddRefer(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } -} - -template -void VAddReluRefer(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - z[i] = z[i] > 0 ? z[i] : 0; - } -} - -template -void VScalRefer(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] * x[i]; - } -} - -template -void VAddBiasRefer(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] + x[i]; - } -} - -template -void VReluRefer(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? 
x[i] : 0; - } -} - #ifdef PADDLE_WITH_MKLML template void VMulMKL(const T* x, const T* y, T* z, int n); @@ -109,7 +67,7 @@ void VScalMKL(const float* a, const float* x, float* y, int n) { if (x == y) { platform::dynload::cblas_sscal(n, *a, y, 1); } else { - VScalRefer(a, x, y, n); + refer::VScal(a, x, y, n); } } @@ -118,7 +76,7 @@ void VScalMKL(const double* a, const double* x, double* y, int n) { if (x == y) { platform::dynload::cblas_dscal(n, *a, y, 1); } else { - VScalRefer(a, x, y, n); + refer::VScal(a, x, y, n); } } @@ -147,7 +105,7 @@ class VMulKernelImpl : public VMulKernel { return; } #endif - this->Compute = VMulRefer; + this->Compute = refer::VMul; } #ifdef PADDLE_WITH_XBYAK @@ -198,7 +156,7 @@ class VAddKernelImpl : public VAddKernel { return; } #endif - this->Compute = VAddRefer; + this->Compute = refer::VAdd; } #ifdef PADDLE_WITH_XBYAK @@ -280,7 +238,7 @@ class VAddReluKernelImpl : public VAddReluKernel { return; } #endif - this->Compute = VAddReluRefer; + this->Compute = refer::VAddRelu; } #ifdef PADDLE_WITH_XBYAK @@ -318,7 +276,7 @@ class VScalKernelImpl : public VScalKernel { return; } #endif - this->Compute = VScalRefer; + this->Compute = refer::VScal; } #ifdef PADDLE_WITH_XBYAK @@ -362,7 +320,7 @@ class VAddBiasKernelImpl : public VAddBiasKernel { } #endif - this->Compute = VAddBiasRefer; + this->Compute = refer::VAddBias; } #ifdef PADDLE_WITH_XBYAK @@ -396,7 +354,7 @@ class VReluKernelImpl : public VReluKernel { } #endif - this->Compute = VReluRefer; + this->Compute = refer::VRelu; } #ifdef PADDLE_WITH_XBYAK @@ -412,16 +370,13 @@ bool VReluKernelImpl::useJIT(int d) { } #endif -template -inline void VIdentityRefer(const T* x, T* y, int n) {} - /* An empty JitKernel */ template class VIdentityKernelImpl : public VIdentityKernel { public: JITKERNEL_DECLARE_STATIC_FUNC; explicit VIdentityKernelImpl(int d) : VIdentityKernel() { - this->Compute = VIdentityRefer; + this->Compute = refer::VIdentity; } }; diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index f26815300d..686f3dd983 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" -#include // for exp #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #ifdef PADDLE_WITH_XBYAK #include "paddle/fluid/operators/math/jit_code.h" @@ -25,48 +25,12 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/mklml.h" #endif -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { namespace jitkernel { namespace jit = platform::jit; -// TODO(TJ): move refer codes to one file -// Refer code only focus on correctness -template -void VExpRefer(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - -template -void VSigmoidRefer(const T* x, T* y, int n) { - // y = 1 / (1 + e^-x) - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - T tmp = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); - y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); - } -} - -template -void VTanhRefer(const T* x, T* y, int n) { - // y = 2 * sigmoid(2x) - 1 - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoidRefer(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} - #ifdef PADDLE_WITH_MKLML // try to use MKL to speedup template @@ -129,7 +93,7 @@ class VExpKernelImpl : public VExpKernel { return; } #endif - this->Compute = VExpRefer; + this->Compute = refer::VExp; } #ifdef PADDLE_WITH_XBYAK @@ -182,7 +146,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel { return; } #endif - this->Compute = VSigmoidRefer; + this->Compute = refer::VSigmoid; } #ifdef PADDLE_WITH_XBYAK @@ -234,7 +198,7 @@ class VTanhKernelImpl : public VTanhKernel { return; } #endif - this->Compute = VTanhRefer; + this->Compute = refer::VTanh; } #ifdef PADDLE_WITH_XBYAK @@ -267,154 +231,6 @@ REGISTER_JITKERNEL(vexp, VExpKernel); REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel); REGISTER_JITKERNEL(vtanh, VTanhKernel); -namespace detail { - -#ifdef __AVX__ - -#define ALIGN32 __attribute__((aligned(32))) - -#define _PS256_CONST(Name, Val) \ - static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \ - Val, Val, Val, Val} - -#define _PI256_CONST(Name, Val) \ - static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \ - Val, Val, Val, Val} - -_PI256_CONST(0x7f, 0x7f); -_PS256_CONST(one, 1.f); -_PS256_CONST(0p5, 0.5f); -_PS256_CONST(exp_hi, 88.3762626647949f); -_PS256_CONST(exp_lo, -88.3762626647949f); -_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); -_PS256_CONST(cephes_exp_C1, 0.693359375); -_PS256_CONST(cephes_exp_C2, -2.12194440e-4); -_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); -_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); -_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); -_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); -_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); -_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); - -typedef union imm_xmm_union { - __m256i imm; - __m128i xmm[2]; -} imm_xmm_union; - -#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ - { \ - imm_xmm_union u ALIGN32; \ - u.imm = imm_; \ - xmm0_ = u.xmm[0]; \ - xmm1_ = u.xmm[1]; \ - } - -#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ - { \ - imm_xmm_union u ALIGN32; \ - u.xmm[0] = xmm0_; \ - u.xmm[1] = xmm1_; \ - imm_ = u.imm; \ - } - -#define AVX2_BITOP_USING_SSE2(fn) \ - static inline __m256i avx2_mm256_##fn(__m256i x, int y) { \ - /* use SSE2 to perform the bitop AVX2 */ \ - __m128i x1, x2; \ - __m256i ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - x1 = _mm_##fn(x1, y); \ - x2 = _mm_##fn(x2, y); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return ret; \ - } - -#define AVX2_INTOP_USING_SSE2(fn) \ - static inline __m256i avx2_mm256_add_epi32(__m256i x, __m256i y) { \ - /* use SSE2 to perform the AVX2 integer operation */ \ - __m128i x1, x2; \ - __m128i y1, y2; \ - __m256i ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - COPY_IMM_TO_XMM(y, y1, y2); \ - x1 = _mm_##fn(x1, y1); \ - x2 = _mm_##fn(x2, y2); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return ret; \ - } - -AVX2_BITOP_USING_SSE2(slli_epi32); -AVX2_INTOP_USING_SSE2(add_epi32); - -#define AVXEXP_BASE \ - __m256 tmp = _mm256_setzero_ps(), fx; \ - __m256 one = *reinterpret_cast(_ps256_one); \ - __m256i imm0; \ - x = _mm256_min_ps(x, *reinterpret_cast(_ps256_exp_hi)); \ - x = _mm256_max_ps(x, *reinterpret_cast(_ps256_exp_lo)); \ - /* express exp(x) as exp(g + n*log(2)) */ \ - fx = _mm256_mul_ps(x, \ - *reinterpret_cast(_ps256_cephes_LOG2EF)); \ 
- fx = _mm256_add_ps(fx, *reinterpret_cast(_ps256_0p5)); \ - tmp = _mm256_floor_ps(fx); \ - /* if greater, substract 1 */ \ - __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); \ - mask = _mm256_and_ps(mask, one); \ - fx = _mm256_sub_ps(tmp, mask); \ - tmp = _mm256_mul_ps(fx, \ - *reinterpret_cast(_ps256_cephes_exp_C1)); \ - __m256 z = _mm256_mul_ps( \ - fx, *reinterpret_cast(_ps256_cephes_exp_C2)); \ - x = _mm256_sub_ps(x, tmp); \ - x = _mm256_sub_ps(x, z); \ - z = _mm256_mul_ps(x, x); \ - __m256 y = *reinterpret_cast(_ps256_cephes_exp_p0); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p1)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p2)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p3)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p4)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p5)); \ - y = _mm256_mul_ps(y, z); \ - y = _mm256_add_ps(y, x); \ - y = _mm256_add_ps(y, one); \ - /* build 2^n */ \ - imm0 = _mm256_cvttps_epi32(fx) - -__m256 ExpAVX(__m256 x) { - AVXEXP_BASE; - // two AVX2 instructions using SSE2 - imm0 = avx2_mm256_add_epi32(imm0, - *reinterpret_cast(_pi256_0x7f)); - imm0 = avx2_mm256_slli_epi32(imm0, 23); - __m256 pow2n = _mm256_castsi256_ps(imm0); - y = _mm256_mul_ps(y, pow2n); - return y; -} -#endif - -#ifdef __AVX2__ -__m256 ExpAVX2(__m256 x) { - AVXEXP_BASE; - // two AVX2 instructions - imm0 = _mm256_add_epi32(imm0, *reinterpret_cast(_pi256_0x7f)); - imm0 = _mm256_slli_epi32(imm0, 23); - __m256 pow2n = _mm256_castsi256_ps(imm0); - y = _mm256_mul_ps(y, pow2n); - return y; -} -#endif - -} // namespace detail } // namespace jitkernel } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h new file mode 100644 index 0000000000..ba5f20e533 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_impl.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 +#define XMM_FLOAT_BLOCK 4 +#define YMM_FLOAT_BLOCK 8 +#define ZMM_FLOAT_BLOCK 16 + +typedef struct { + void* gates; // gates: W_ch, W_ih, W_fh, W_oh + const void* ct_1; + void* ct; + void* ht; + /* weight_peephole and checked data are only used in peephole*/ + const void* wp{nullptr}; + void* checked{nullptr}; +} lstm_t; + +typedef struct { + void* gates; // gates: {W_update, W_reset; W_state} + const void* ht_1; + void* ht; +} gru_t; + +struct rnn_attr_s { + int d; + std::string act_gate, act_cand; + rnn_attr_s() = default; + rnn_attr_s(int _d, const std::string& _act_gate, const std::string& _act_cand) + : d(_d), act_gate(_act_gate), act_cand(_act_cand) {} +}; + +struct lstm_attr_s : public rnn_attr_s { + bool use_peephole; + std::string act_cell; + lstm_attr_s() = default; + lstm_attr_s(int _d, const std::string& _act_gate, + const std::string& _act_cand, const std::string& _act_cell, + bool _use_peephole = false) + : rnn_attr_s(_d, _act_gate, _act_cand), + use_peephole(_use_peephole), + act_cell(_act_cell) {} +}; + +typedef struct rnn_attr_s gru_attr_t; +typedef struct lstm_attr_s lstm_attr_t; + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h index 8acf60cfbf..5a3efd979f 100644 --- a/paddle/fluid/operators/math/jit_kernel_macro.h +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -82,10 +82,10 @@ namespace jitkernel { #define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_define_name, \ marco_declare, macro_find_key, macro_impl) \ marco_define_name(ker_key, ker_class); \ - REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, JITKERNEL_DECLARE, \ - JITKERNEL_FIND_KEY, JITKERNEL_IMPL); \ - REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, JITKERNEL_DECLARE, \ - JITKERNEL_FIND_KEY, JITKERNEL_IMPL) + REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, marco_declare, \ + macro_find_key, macro_impl); \ + REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, marco_declare, \ + macro_find_key, macro_impl) #define REGISTER_JITKERNEL(ker_key, ker_class) \ REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \ diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h new file mode 100644 index 0000000000..e0b2e3c7fa --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -0,0 +1,238 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
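// Editor's note -- not part of the patch: the attr structs defined in
// jit_kernel_impl.h above replace the old long string/int argument lists. The
// LSTM `gates` buffer they describe is four contiguous blocks of length d,
// [W_ch | W_ih | W_fh | W_oh], and the GRU buffer is three,
// [W_update | W_reset | W_state]; that layout is why the kernels index them
// with gates, gates + d, gates + 2 * d and gates + 3 * d. A minimal sketch of
// constructing the attrs; the variable names are illustrative only:
#include "paddle/fluid/operators/math/jit_kernel_impl.h"

namespace jk = paddle::operators::math::jitkernel;

// d = 8; gate/candidate/cell activation names; peephole connections disabled.
jk::lstm_attr_t example_lstm_attr(8, "sigmoid", "tanh", "tanh",
                                  /*use_peephole=*/false);
jk::gru_attr_t example_gru_attr(8, "sigmoid", "tanh");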
*/ + +#pragma once +#include +#include +#include "paddle/fluid/operators/math/jit_kernel_impl.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { +namespace refer { +/* Refer code only focus on correctness */ + +template +void VMul(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +template +void VAdd(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +template +void VAddRelu(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + z[i] = z[i] > 0 ? z[i] : 0; + } +} + +template +void VScal(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] * x[i]; + } +} + +template +void VAddBias(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] + x[i]; + } +} + +template +void VRelu(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template +inline void VIdentity(const T* x, T* y, int n) {} + +template +void VExp(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +template +void VSigmoid(const T* x, T* y, int n) { + // y = 1 / (1 + e^-x) + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); + } +} + +template +void VTanh(const T* x, T* y, int n) { + // y = 2 * sigmoid(2x) - 1 + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoid(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} + +template +void (*getActFunc(const std::string& type))(const T*, T*, int) { // NOLINT + if (type == "sigmoid") { + return VSigmoid; + } else if (type == "relu") { + return VRelu; + } else if (type == "tanh") { + return VTanh; + } else if (type == "identity" || type == "") { + return VIdentity; + } + PADDLE_THROW("Not support type: %s", type); + return nullptr; +} + +// compute ct and ht +template +void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + const T* ct_1 = reinterpret_cast(step->ct_1); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + const T* wp = reinterpret_cast(step->wp); + T* checked = reinterpret_cast(step->checked); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + // gates: W_ch, W_ih, W_fh, W_oh + if (attr->use_peephole) { + VMul(wp, ct_1, checked, d); + VMul(wp + d, ct_1, checked + d, d); + VAdd(checked, gates + d, gates + d, d2); + act_gate(gates + d, gates + d, d2); + } else { + act_gate(gates + d, gates + d, d3); + } + + // C_t = C_t-1 * fgated + cand_gated * igated + act_cand(gates, gates, d); + VMul(gates, gates + d, gates + d, d); + VMul(ct_1, gates + d2, gates + d2, d); + VAdd(gates + d, gates + d2, ct, d); + + if (attr->use_peephole) { + // get ogated + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + act_gate(gates + d3, gates + d3, d); + } + // H_t = act_cell(C_t) * ogated + act_cell(ct, gates + d2, d); + VMul(gates + d2, gates + d3, ht, d); +} + +// compute c1 and h1 without c0 or h0 +template +void LSTMC1H1(lstm_t* step, const 
lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + /* C_t = igated * cgated*/ + act_gate(gates + d, gates + d, d); + act_cand(gates, gates, d); + VMul(gates, gates + d, ct, d); + if (attr->use_peephole) { + // get outgated, put W_oc * C_t on igated + const T* wp = reinterpret_cast(step->wp); + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + } + /* H_t = act_cell(C_t) * ogated */ + act_gate(gates + d3, gates + d3, d); + act_cell(ct, gates + d2, d); + VMul(gates + d2, gates + d3, ht, d); +} + +// compute h1 without h0 +template +void GRUH1(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + int d2 = d * 2; + act_gate(gates, gates, d); + act_cand(gates + d2, gates + d2, d); + VMul(gates, gates + d2, ht, d); +} + +// compute the first part of GRU: ht = act_gate(r) * ht_1 +template +void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { + // W: {W_update, W_reset; W_state} + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); + act_gate(gates + attr->d, gates + attr->d, attr->d); + VMul(ht_1, gates + attr->d, ht, attr->d); +} + +// compute the second part of GRU: +// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 +template +void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + T* y = gates + d * 2; + act_gate(gates, gates, d); + act_cand(y, y, d); + // out = zt*ht~ + (1-zt)*ht_1 + for (int i = 0; i < d; ++i) { + ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; + } +} + +} // namespace refer +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index e79b0400ab..2db3274a45 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -15,470 +15,248 @@ limitations under the License. 
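// Editor's note -- not part of the patch: the refer:: kernels above are plain
// function templates, so they can be called directly (the updated unit tests
// below do exactly this), and they are also what the LSTMKernel/GRUKernel
// function pointers fall back to when no JIT code is generated. A minimal
// call sketch, with illustrative buffer names and the [4*d] / [3*d] gate
// layouts noted earlier:
#include <vector>
#include "paddle/fluid/operators/math/jit_kernel_impl.h"
#include "paddle/fluid/operators/math/jit_kernel_refer.h"

void ReferKernelExample(int d) {
  namespace jk = paddle::operators::math::jitkernel;

  // One LSTM time step on a [4*d] gate buffer.
  std::vector<float> lstm_gates(4 * d), ct_1(d), ct(d), ht(d);
  jk::lstm_attr_t lstm_attr(d, "sigmoid", "tanh", "tanh");
  jk::lstm_t lstm_step;
  lstm_step.gates = lstm_gates.data();
  lstm_step.ct_1 = ct_1.data();
  lstm_step.ct = ct.data();
  lstm_step.ht = ht.data();
  jk::refer::LSTMCtHt<float>(&lstm_step, &lstm_attr);

  // GRU part 2: ht = act_gate(u) * act_cand(s) + (1 - act_gate(u)) * ht_1.
  std::vector<float> gru_gates(3 * d), ht_1(d), gru_ht(d);
  jk::gru_attr_t gru_attr(d, "sigmoid", "tanh");
  jk::gru_t gru_step;
  gru_step.gates = gru_gates.data();
  gru_step.ht_1 = ht_1.data();
  gru_step.ht = gru_ht.data();
  jk::refer::GRUHtPart2<float>(&gru_step, &gru_attr);
}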
*/ #include "paddle/fluid/operators/math/jit_kernel.h" #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" -#ifdef __AVX__ -#include +#ifdef PADDLE_WITH_XBYAK +#include "paddle/fluid/operators/math/jit_code.h" #endif namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace detail { -#ifdef __AVX__ -__m256 ExpAVX(__m256 x); -#endif - -#ifdef __AVX2__ -__m256 ExpAVX2(__m256 x); -#endif - -} // namespace detail - -namespace jit = platform::jit; -#ifdef __AVX__ -typedef enum { kSigmoid, kRelu, kTanh, kIdentity } act_type; - -class AVXAct { - public: - virtual ~AVXAct() = default; - virtual __m256 Compute(__m256 x) const = 0; -}; - -template -class AVXActImpl : public AVXAct { +/* LSTM JitKernel */ +template +class LSTMKernelImpl : public LSTMKernel { public: - __m256 Compute(__m256 x) const override { PADDLE_THROW("Unkown type!"); } -}; - -#define AVX_SIGMOID(isa, expisa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - __m256 ones = _mm256_set1_ps(1.0f); \ - x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); \ - x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); \ - x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x); \ - x = expisa(x); \ - x = _mm256_add_ps(ones, x); \ - return _mm256_div_ps(ones, x); \ - } - -#define AVX_TANH(isa, expisa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - __m256 ones = _mm256_set1_ps(1.0f); \ - x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x); \ - x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT)); \ - x = expisa(x); \ - x = _mm256_add_ps(ones, x); \ - x = _mm256_div_ps(_mm256_set1_ps(2.0f), x); \ - return _mm256_sub_ps(x, ones); \ - } - -#define AVX_RELU(isa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - return _mm256_max_ps(x, _mm256_setzero_ps()); \ + static inline std::string name(const lstm_attr_t& attr) { + PADDLE_THROW("DType should be either float or double"); } + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + explicit LSTMKernelImpl(const lstm_attr_t& attr) : LSTMKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(attr.d)) { + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; + jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096)); + this->ComputeCtHt = + jitcode0_->getCode(); + + jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? 
sz : 4096)); + this->ComputeC1H1 = + jitcode1_->getCode(); + return; + } +#endif -#define AVX_IDENTITY(isa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - return x; \ + this->ComputeCtHt = refer::LSTMCtHt; + this->ComputeC1H1 = refer::LSTMC1H1; } -#define FOR_EACH_AVX_ISA(macro_) \ - macro_(jit::avx); \ - macro_(jit::avx2); \ - macro_(jit::avx512f) - -FOR_EACH_AVX_ISA(AVX_RELU); -FOR_EACH_AVX_ISA(AVX_IDENTITY); - -AVX_SIGMOID(jit::avx, detail::ExpAVX); -AVX_TANH(jit::avx, detail::ExpAVX); +#ifdef PADDLE_WITH_XBYAK -#ifdef __AVX2__ -AVX_SIGMOID(jit::avx2, detail::ExpAVX2); -AVX_SIGMOID(jit::avx512f, detail::ExpAVX2); -AVX_TANH(jit::avx2, detail::ExpAVX2); -AVX_TANH(jit::avx512f, detail::ExpAVX2); + private: + std::unique_ptr jitcode0_{nullptr}, jitcode1_{nullptr}; #endif +}; -#undef FOR_EACH_AVX_ISA -#undef AVX_IDENTITY -#undef AVX_RELU -#undef AVX_TANH -#undef AVX_SIGMOID - +#ifdef PADDLE_WITH_XBYAK +template <> +bool LSTMKernelImpl::useJIT(int d) { + return gen::LSTMJitCode::init(d); +} #endif +/* Peephole JitKernel */ template -static std::shared_ptr> GetActKernel( - const std::string& type, int n) { - if (type == "sigmoid") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } else if (type == "relu") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } else if (type == "tanh") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } else if (type == "identity" || type == "") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } - PADDLE_THROW("Not support type: %s", type); - return nullptr; -} - -#ifdef __AVX__ -template -static std::unique_ptr GetAVXAct(const std::string& type) { - if (type == "sigmoid") { - return std::unique_ptr(new AVXActImpl()); - } else if (type == "relu") { - return std::unique_ptr(new AVXActImpl()); - } else if (type == "tanh") { - return std::unique_ptr(new AVXActImpl()); - } else if (type == "identity" || type == "") { - return std::unique_ptr(new AVXActImpl()); +class PeepholeKernelImpl : public LSTMKernel { + public: + static inline std::string name(const lstm_attr_t& attr) { + PADDLE_THROW("DType should be either float or double"); } - PADDLE_THROW("Not support type: %s", type); - return nullptr; -} + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + explicit PeepholeKernelImpl(const lstm_attr_t& attr) : LSTMKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(attr.d)) { + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 4 * 8; + jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096)); + this->ComputeCtHt = + jitcode0_->getCode(); + + jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? 
sz : 4096)); + this->ComputeC1H1 = + jitcode1_->getCode(); + return; + } #endif -/* LSTM JitKernel */ -template -class LSTMKernelImpl : public LSTMKernel { - public: - explicit LSTMKernelImpl(const std::string& act_gate, - const std::string& act_cand, - const std::string& act_cell, int d) - : LSTMKernel() { - d_ = d; - d2_ = d * 2; - d3_ = d * 3; - act_gate_d3_ = GetActKernel(act_gate, d3_); - act_gate_d_ = GetActKernel(act_gate, d); - act_cand_d_ = GetActKernel(act_cand, d); - act_cell_d_ = GetActKernel(act_cell, d); - vmul_d_ = KernelPool::Instance().template Get>(d); - vadd_d_ = KernelPool::Instance().template Get>(d); + this->ComputeCtHt = refer::LSTMCtHt; + this->ComputeC1H1 = refer::LSTMC1H1; } - void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, - T* checked) const override { - // gates: W_ch, W_ih, W_fh, W_oh - act_gate_d3_->Compute(gates + d_, gates + d_, d3_); - - /* C_t = C_t-1 * fgated + cand_gated * igated */ - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, gates + d_, d_); - vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); - vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); - - /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); - } - void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { - /* C_t = igated * cgated*/ - act_gate_d_->Compute(gates + d_, gates + d_, d_); - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, ct, d_); - /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->Compute(gates + d3_, gates + d3_, d_); - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); - } +#ifdef PADDLE_WITH_XBYAK private: - int d_, d2_, d3_; - std::shared_ptr> act_gate_d3_, act_gate_d_, act_cand_d_, - act_cell_d_; - std::shared_ptr> vmul_d_; - std::shared_ptr> vadd_d_; -#ifdef __AVX__ - std::unique_ptr avx_act_gate_, avx_act_cand_, avx_act_cell_; + std::unique_ptr jitcode0_{nullptr}, jitcode1_{nullptr}; #endif }; -#define INTRI8_FLOAT(isa) \ - template <> \ - LSTMKernelImpl::LSTMKernelImpl( \ - const std::string& act_gate, const std::string& act_cand, \ - const std::string& act_cell, int d) \ - : LSTMKernel() { \ - avx_act_gate_ = GetAVXAct(act_gate); \ - avx_act_cand_ = GetAVXAct(act_cand); \ - avx_act_cell_ = GetAVXAct(act_cell); \ - } \ - template <> \ - void LSTMKernelImpl::ComputeCtHt( \ - float* gates, const float* ct_1, float* ct, float* ht, \ - const float* wp_data, float* checked) const { \ - /* gates: W_ch, W_ih, W_fh, W_oh */ \ - __m256 c, i, f, o; \ - c = _mm256_loadu_ps(gates); \ - i = _mm256_loadu_ps(gates + 8); \ - f = _mm256_loadu_ps(gates + 16); \ - o = _mm256_loadu_ps(gates + 24); \ - /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ - c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ - i = _mm256_loadu_ps(ct_1); \ - f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ - f = _mm256_add_ps(c, f); \ - _mm256_storeu_ps(ct, f); \ - /* H_t = act_cell(C_t) * ogated */ \ - o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ - _mm256_storeu_ps(ht, o); \ - } \ - template <> \ - void LSTMKernelImpl::ComputeC1H1( \ - float* gates, float* ct, float* ht, const float* wp_data) const { \ - __m256 c, i, o; \ - c = _mm256_loadu_ps(gates); \ - i = _mm256_loadu_ps(gates + 8); \ - o = _mm256_loadu_ps(gates + 24); \ - /* C_t = igated * cgated*/ \ - c = _mm256_mul_ps(avx_act_gate_->Compute(i), 
avx_act_cand_->Compute(c)); \ - _mm256_storeu_ps(ct, c); \ - /* H_t = act_cell(C_t) * ogated */ \ - o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \ - _mm256_storeu_ps(ht, o); \ - } - -// TODO(TJ): optimize keq16 - -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f); +#ifdef PADDLE_WITH_XBYAK +template <> +bool PeepholeKernelImpl::useJIT(int d) { + return gen::LSTMJitCode::init(d); +} #endif -/* Peephole JitKernel */ -template -class PeepholeKernelImpl : public LSTMKernel { - public: - explicit PeepholeKernelImpl(const std::string& act_gate, - const std::string& act_cand, - const std::string& act_cell, int d) - : LSTMKernel() { - d_ = d; - d2_ = d * 2; - d3_ = d * 3; - act_gate_d_ = GetActKernel(act_gate, d); - act_cand_d_ = GetActKernel(act_cand, d); - act_cell_d_ = GetActKernel(act_cell, d); - vmul_d_ = KernelPool::Instance().template Get>(d); - vadd_d_ = KernelPool::Instance().template Get>(d); - vadd_d2_ = KernelPool::Instance().template Get>(d2_); - act_gate_d2_ = GetActKernel(act_gate, d2_); +#define JITKERNEL_DEFINE_NAME_LSTM(ker_key, ker_class) \ + template <> \ + std::string ker_class##Impl::name(const lstm_attr_t& attr) { \ + std::string key(#ker_key "f"); \ + key += (attr.act_gate + attr.act_cand + attr.act_cell + \ + (attr.use_peephole ? "p" : "n")); \ + if (useJIT(attr.d)) { \ + /* only jit code need record d*/ \ + return key + "jit" + std::to_string(attr.d); \ + } else if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } \ + template <> \ + std::string ker_class##Impl::name(const lstm_attr_t& attr) { \ + std::string key(#ker_key "d"); \ + /* jit code do not support double yet*/ \ + if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ } - void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, - T* checked) const override { - /* get fgated and igated*/ - vmul_d_->Compute(wp_data, ct_1, checked, d_); - vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_); - vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_); - act_gate_d2_->Compute(gates + d_, gates + d_, d2_); - /* C_t = C_t-1 * fgated + cand_gated * igated*/ - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, gates + d_, d_); - vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); - vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); - /* get ogated*/ - vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); - vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); - act_gate_d_->Compute(gates + d3_, gates + d3_, d_); - /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); +#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, const lstm_attr_t&>( \ + const lstm_attr_t& attr) + +#define JITKERNEL_FIND_KEY_LSTM(ker_class, ker_dtype) \ + std::string key = ker_class##Impl::name(attr) + +#define JITKERNEL_LSTM_IMPL(ker, dtype) \ + if (attr.use_peephole) { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(attr)); \ + } else { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(attr)); \ } - void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { - /* C_t = igated * cgated*/ - act_gate_d_->Compute(gates + d_, gates + d_, d_); - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, ct, d_); 
- /* get outgated, put W_oc * C_t on igated */ - vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); - vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); - /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->Compute(gates + d3_, gates + d3_, d_); - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); - } +REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DEFINE_NAME_LSTM, + JITKERNEL_DECLARE_LSTM, JITKERNEL_FIND_KEY_LSTM, + JITKERNEL_LSTM_IMPL); - private: - int d_, d2_, d3_; - std::shared_ptr> act_gate_d2_, act_gate_d_, act_cand_d_, - act_cell_d_; - std::shared_ptr> vmul_d_; - std::shared_ptr> vadd_d_, vadd_d2_; -}; - -#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \ - template <> \ - std::shared_ptr> \ - KernelPool::Get, const std::string&, \ - const std::string&, const std::string&, int, bool>( \ - const std::string& act_gate, const std::string& act_cand, \ - const std::string& act_cell, int d, bool use_peephole) - -#define JITKERNEL_KEY_LSTM(ker_key, dtype_key) \ - #ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \ - (use_peephole ? "p" : "n") - -#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k) \ - if (use_peephole) { \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>( \ - act_gate, act_cand, act_cell, d)); \ - } else { \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>(act_gate, act_cand, \ - act_cell, d)); \ - } - -REGISTER_JITKERNEL_ARGS_DEPRECATED(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM, - JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL); - -#undef INTRI8_FLOAT +#undef JITKERNEL_LSTM_IMPL +#undef JITKERNEL_FIND_KEY_LSTM #undef JITKERNEL_DECLARE_LSTM -#undef JITKERNEL_KEY_LSTM -#undef JITKERNEL_NEW_LSTM_IMPL +#undef JITKERNEL_DEFINE_NAME_LSTM /* GRU JitKernel */ -template +template class GRUKernelImpl : public GRUKernel { public: - explicit GRUKernelImpl(const std::string& act_gate, - const std::string& act_state, int d) - : GRUKernel() { - d_ = d; - d2_ = d * 2; - act_gate_d2_ = GetActKernel(act_gate, d2_); - act_gate_d_ = GetActKernel(act_gate, d); - act_state_d_ = GetActKernel(act_state, d); - vmul_d_ = KernelPool::Instance().template Get>(d); - } - - void ComputeH1(T* gates, T* ht) const override { - act_gate_d_->Compute(gates, gates, d_); - act_state_d_->Compute(gates + d2_, gates + d2_, d_); - vmul_d_->Compute(gates, gates + d2_, ht, d_); + static inline std::string name(const gru_attr_t& attr) { + PADDLE_THROW("DType should be either float or double"); } - - void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override { - // W: {W_update, W_reset; W_state} - act_gate_d2_->Compute(gates, gates, d2_); - vmul_d_->Compute(ht_1, gates + d_, ht, d_); - } - - void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override { - T* y = gates + d2_; - act_state_d_->Compute(y, y, d_); - // out = zt*ht~ + (1-zt)*ht_1 - for (int i = 0; i < d_; ++i) { - ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(attr.d)) { + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; + jitcode0_.reset(new gen::GRUJitCode(0, attr, sz > 4096 ? sz : 4096)); + this->ComputeH1 = + jitcode0_->getCode(); + + jitcode1_.reset(new gen::GRUJitCode(1, attr, sz > 4096 ? 
sz : 4096)); + this->ComputeHtPart1 = + jitcode1_->getCode(); + + jitcode2_.reset(new gen::GRUJitCode(2, attr, sz > 4096 ? sz : 4096)); + this->ComputeHtPart2 = + jitcode2_->getCode(); + return; } +#endif + this->ComputeH1 = refer::GRUH1; + this->ComputeHtPart1 = refer::GRUHtPart1; + this->ComputeHtPart2 = refer::GRUHtPart2; } +#ifdef PADDLE_WITH_XBYAK private: - int d_, d2_; - std::shared_ptr> act_gate_d2_, act_gate_d_, act_state_d_; - std::shared_ptr> vmul_d_; -#ifdef __AVX__ - std::unique_ptr avx_act_gate_, avx_act_state_; + std::unique_ptr jitcode0_{nullptr}, jitcode1_{nullptr}, + jitcode2_{nullptr}; #endif }; -#define INTRI8_FLOAT(isa) \ - template <> \ - GRUKernelImpl::GRUKernelImpl( \ - const std::string& act_gate, const std::string& act_state, int d) \ - : GRUKernel() { \ - avx_act_gate_ = GetAVXAct(act_gate); \ - avx_act_state_ = GetAVXAct(act_state); \ - } \ - template <> \ - void GRUKernelImpl::ComputeH1(float* gates, float* ht) \ - const { \ - __m256 u, s; \ - /* W: {W_update, W_reset; W_state} */ \ - u = _mm256_loadu_ps(gates); \ - s = _mm256_loadu_ps(gates + 16); \ - s = _mm256_mul_ps(avx_act_gate_->Compute(u), avx_act_state_->Compute(s)); \ - _mm256_storeu_ps(ht, s); \ - } \ - template <> \ - void GRUKernelImpl::ComputeHtPart1( \ - float* gates, const float* ht_1, float* ht) const { \ - /* not exactly equal the any implementation */ \ - __m256 r, ht0; \ - r = _mm256_loadu_ps(gates + 8); \ - ht0 = _mm256_loadu_ps(ht_1); \ - r = _mm256_mul_ps(avx_act_gate_->Compute(r), ht0); \ - _mm256_storeu_ps(ht, r); \ - } \ - template <> \ - void GRUKernelImpl::ComputeHtPart2( \ - float* gates, const float* ht_1, float* ht) const { \ - /* not exactly equal the any implementation */ \ - __m256 u, s, ht0; \ - u = _mm256_loadu_ps(gates); \ - s = _mm256_loadu_ps(gates + 16); \ - ht0 = _mm256_loadu_ps(ht_1); \ - u = avx_act_gate_->Compute(u); \ - s = _mm256_mul_ps(u, avx_act_state_->Compute(s)); \ - u = _mm256_sub_ps(_mm256_set1_ps(1.f), u); \ - u = _mm256_mul_ps(u, ht0); \ - u = _mm256_add_ps(s, u); \ - _mm256_storeu_ps(ht, u); \ - } - -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f); +#ifdef PADDLE_WITH_XBYAK +template <> +bool GRUKernelImpl::useJIT(int d) { + return gen::GRUJitCode::init(d); +} #endif -#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \ - template <> \ - std::shared_ptr> KernelPool::Get< \ - GRUKernel, const std::string&, const std::string&, int>( \ - const std::string& act_gate, const std::string& act_state, int d) +#define JITKERNEL_DEFINE_NAME_GRU(ker_key, ker_class) \ + template <> \ + std::string ker_class##Impl::name(const gru_attr_t& attr) { \ + std::string key(#ker_key "f"); \ + key += (attr.act_gate + attr.act_cand); \ + if (useJIT(attr.d)) { \ + /* only jit code need record d*/ \ + return key + "jit" + std::to_string(attr.d); \ + } else if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } \ + template <> \ + std::string ker_class##Impl::name(const gru_attr_t& attr) { \ + std::string key(#ker_key "d"); \ + /* jit code do not support double yet*/ \ + if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } + +#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, const gru_attr_t&>( \ + const gru_attr_t& attr) -#define JITKERNEL_KEY_GRU(ker_key, dtype_key) \ - #ker_key #dtype_key + std::to_string(d) + act_gate + act_state +#define 
JITKERNEL_FIND_KEY_GRU(ker_class, ker_dtype) \ + std::string key = ker_class##Impl::name(attr) -#define JITKERNEL_NEW_GRU_IMPL(ker, dtype, isa, k) \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>(act_gate, act_state, d)); +#define JITKERNEL_GRU_IMPL(ker, dtype) \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(attr)); -REGISTER_JITKERNEL_ARGS_DEPRECATED(gru, GRUKernel, JITKERNEL_DECLARE_GRU, - JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL); +REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DEFINE_NAME_GRU, + JITKERNEL_DECLARE_GRU, JITKERNEL_FIND_KEY_GRU, + JITKERNEL_GRU_IMPL); -#undef INTRI8_FLOAT -#undef JITKERNEL_NEW_GRU_IMPL -#undef JITKERNEL_KEY_GRU +#undef JITKERNEL_GRU_IMPL +#undef JITKERNEL_FIND_KEY_GRU #undef JITKERNEL_DECLARE_GRU +#undef JITKERNEL_DEFINE_NAME_GRU } // namespace jitkernel } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index b6c62a2634..ed86a47e15 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" -#include #include // for exp #include // for memcpy #include @@ -22,6 +21,8 @@ limitations under the License. */ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" +#include "paddle/fluid/platform/port.h" #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" @@ -53,12 +54,6 @@ void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), } } -void vrelu_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0.f ? 
x[i] : 0.f; - } -} - #if defined __AVX__ || defined __AVX2__ void vrelu_intri8(const int n, const float* x, float* y) { __m256 tmp = _mm256_loadu_ps(x); @@ -69,6 +64,7 @@ void vrelu_intri8(const int n, const float* x, float* y) { TEST(JitKernel, vrelu) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -80,7 +76,7 @@ TEST(JitKernel, vrelu) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vrelu_ref(d, x_data, zref_data); + refer::VRelu(x_data, zref_data, d); } auto trefe = GetCurrentUS(); #if defined __AVX__ || defined __AVX2__ @@ -90,7 +86,7 @@ TEST(JitKernel, vrelu) { vrelu_intri8(d, x_data, zref_data); } auto si1 = GetCurrentUS(); - VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat << " us"; } #endif auto ttgts = GetCurrentUS(); @@ -98,23 +94,17 @@ TEST(JitKernel, vrelu) { ker->Compute(x_data, ztgt_data, d); } auto ttgte = GetCurrentUS(); - VLOG(30) << "Vec size " << d - << ": refer takes: " << (trefe - trefs) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -void vaddbias_ref(const int n, const float a, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] + a; - } -} - TEST(JitKernel, vaddbias) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -127,7 +117,7 @@ TEST(JitKernel, vaddbias) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vaddbias_ref(d, a, x_data, zref_data); + refer::VAddBias(&a, x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -136,21 +126,14 @@ TEST(JitKernel, vaddbias) { } auto ttgte = GetCurrentUS(); - VLOG(30) << "Vec size " << d - << ": refer takes: " << (trefe - trefs) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -void vexp_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - #ifdef PADDLE_WITH_MKLML void vexp_mkl(const int n, const float* x, float* y) { paddle::platform::dynload::vsExp(n, x, y); @@ -159,6 +142,7 @@ void vexp_mkl(const int n, const float* x, float* y) { TEST(JitKernel, vexp) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -170,7 +154,7 @@ TEST(JitKernel, vexp) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vexp_ref(d, x_data, zref_data); + refer::VExp(x_data, zref_data, d); } auto trefe = GetCurrentUS(); @@ -189,33 +173,20 @@ TEST(JitKernel, vexp) { } auto ttgte = GetCurrentUS(); - VLOG(30) << "Vec size " << d - << ": refer 
takes: " << (trefe - trefs) / repeat + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat #ifdef PADDLE_WITH_MKLML - << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " #else - << " us, " + << " us, " #endif - << "tgt takes: " << (ttgte - ttgts) / repeat; + + << "tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -inline float _sigmoid(float x) { - const float min = SIGMOID_THRESHOLD_MIN; - const float max = SIGMOID_THRESHOLD_MAX; - float tmp = (x < min) ? min : ((x > max) ? max : x); - return 1.f / (1.f + std::exp(-tmp)); -} - -void vsigmoid_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = _sigmoid(x[i]); - } -} - void vsigmoid_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VExpKernel>& vexp, @@ -234,6 +205,7 @@ void vsigmoid_better( TEST(JitKernel, vsigmoid) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -252,7 +224,7 @@ TEST(JitKernel, vsigmoid) { auto tmkle = GetCurrentUS(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vsigmoid_ref(d, x_data, zref_data); + refer::VSigmoid(x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -261,24 +233,15 @@ TEST(JitKernel, vsigmoid) { } auto ttgte = GetCurrentUS(); - VLOG(30) << "Vec size " << d - << ": refer takes: " << (trefe - trefs) / repeat - << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -inline float _tanh(float x) { return 2.f * _sigmoid(2.f * x) - 1.f; } - -void vtanh_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = _tanh(x[i]); - } -} - void vtanh_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VScalKernel>& vscal, @@ -298,6 +261,7 @@ void vtanh_better( TEST(JitKernel, vtanh) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -320,7 +284,7 @@ TEST(JitKernel, vtanh) { auto tmkle = GetCurrentUS(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vtanh_ref(d, x_data, zref_data); + refer::VTanh(x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -329,42 +293,15 @@ TEST(JitKernel, vtanh) { } auto ttgte = GetCurrentUS(); - VLOG(30) << "Vec size " << d - << ": refer takes: " << (trefe - trefs) / repeat - << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat - << " us, tgt takes: " << (ttgte - ttgts) / repeat; + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit exp) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -void 
lstm_ctht_ref( - const std::shared_ptr< - const paddle::operators::math::jitkernel::VSigmoidKernel>& - vsigmoid_3d, - const std::shared_ptr< - const paddle::operators::math::jitkernel::VTanhKernel>& vtanh_d, - const std::shared_ptr< - const paddle::operators::math::jitkernel::VExpKernel>& vexp_1, - const int d, float* gates, const float* ct_1, float* ct, float* ht) { - vsigmoid_3d->Compute(gates + d, gates + d, 3 * d); - vtanh_d->Compute(gates, gates, d); - const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; - const float min = SIGMOID_THRESHOLD_MIN; - const float max = SIGMOID_THRESHOLD_MAX; - for (int k = 0; k < d; ++k) { - // C_t = C_t-1 * fgated + cand_gated * igated - ct[k] = ct_1[k] * f[k] + gates[k] * i[k]; - // H_t = act_cell(C_t) * ogated - float tmp = ct[k] * 2; - tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); - vexp_1->Compute(&tmp, &tmp, 1); - tmp = 2.f / (1.f + tmp) - 1.f; - ht[k] = tmp * o[k]; - } -} - void lstm_ctht_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VSigmoidKernel>& @@ -389,6 +326,7 @@ void lstm_ctht_better( TEST(JitKernel, lstm) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) { int d4 = d * 4; int d3 = d * 3; @@ -399,19 +337,17 @@ TEST(JitKernel, lstm) { RandomVec(d, ct_1.data(), -2.f, 2.f); memcpy(xref.data(), x.data(), sizeof(float) * d4); std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + const jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell, false); const auto& ker = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, d, false); + .template Get, const jit::lstm_attr_t&>( + attr); // below kernels are used to compute refer const auto& vsigmoid_3d = jit::KernelPool::Instance().template Get>( d3); const auto& vtanh_d = jit::KernelPool::Instance().template Get>(d); - const auto& vexp_1 = - jit::KernelPool::Instance().template Get>(1); const auto& vmul_d = jit::KernelPool::Instance().template Get>(d); const auto& vadd_d = @@ -425,9 +361,17 @@ TEST(JitKernel, lstm) { float* ct_ref_data = ct_ref.data(); float* ht_ref_data = ht_ref.data(); // compute once to check correctness - lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, - ct_ref_data, ht_ref_data); - ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + jit::lstm_t step; + step.gates = xref_data; + step.ct_1 = ct_1_data; + step.ct = ct_ref_data; + step.ht = ht_ref_data; + refer::LSTMCtHt(&step, &attr); + + step.gates = x_data; + step.ct = ct_tgt_data; + step.ht = ht_tgt_data; + ker->ComputeCtHt(&step, &attr); for (int i = 0; i < d; ++i) { EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3); EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3); @@ -441,32 +385,20 @@ TEST(JitKernel, lstm) { auto tmkle = GetCurrentUS(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, - ct_ref_data, ht_ref_data); + refer::LSTMCtHt(&step, &attr); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + ker->ComputeCtHt(&step, &attr); } auto ttgte = GetCurrentUS(); - VLOG(30) << "Vec size " << d - << ": refer takes: " << (trefe - trefs) / repeat - << " us, better(jit) takes: " << (tmkle - tmkls) / repeat - << " us, 
tgt takes: " << (ttgte - ttgts) / repeat; + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better(jit) takes: " << (tmkle - tmkls) / repeat + << " us, tgt takes: " << (ttgte - ttgts) / repeat << " us"; } } -void vscal_ref(const int n, const float a, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = a * x[i]; - } -} -void vscal_inp_ref(const int n, const float a, float* x) { - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -} #if defined __AVX__ || defined __AVX2__ void vscal_intri8(const int n, const float a, const float* x, float* y) { __m256 tmp; @@ -492,6 +424,7 @@ void vscal_inp_mkl(const int n, const float a, float* x) { TEST(JitKernel, vscal) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 256, 512}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -506,12 +439,12 @@ TEST(JitKernel, vscal) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vscal_ref(d, a, x_data, zref_data); + refer::VScal(&a, x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto trefs1 = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vscal_inp_ref(d, a, y_data); + refer::VScal(&a, y_data, y_data, d); } auto trefe1 = GetCurrentUS(); @@ -535,8 +468,8 @@ TEST(JitKernel, vscal) { vscal_inp_intri8(d, a, y_data); } auto si3 = GetCurrentUS(); - VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat - << " us, inplace: " << (si3 - si2) / repeat; + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat + << " us, inplace: " << (si3 - si2) / repeat << " us"; } #endif @@ -550,29 +483,21 @@ TEST(JitKernel, vscal) { ker->Compute(&a, y_data, y_data, d); } auto ttgte1 = GetCurrentUS(); - VLOG(30) << "Vec size " << d - << ": refer takes: " << (trefe - trefs) / repeat - << " us, inplace takes: " << (trefe1 - trefs1) / repeat + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, inplace takes: " << (trefe1 - trefs1) / repeat #ifdef PADDLE_WITH_MKLML - << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat - << " us, " + << " us, mkl inplace takes: " << (tmkle - tmkls) / repeat << " us, " #else - << " us, " + << " us, " #endif - << "tgt takes: " << (ttgte - ttgts) / repeat - << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat; + << "tgt takes: " << (ttgte - ttgts) / repeat + << "us, tgt inplace takes: " << (ttgte1 - ttgts1) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -void vmul_ref(const int n, const float* x, const float* y, float* z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - #if defined __AVX__ || defined __AVX2__ void vmul_intri8(const int n, const float* x, const float* y, float* z) { __m256 tmpx, tmpy; @@ -591,6 +516,7 @@ void vmul_mkl(const int n, const float* x, const float* y, float* z) { TEST(JitKernel, vmul) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 20, 30, 256, 512, 1000, 1024}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -604,7 +530,7 @@ TEST(JitKernel, vmul) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vmul_ref(d, x_data, y_data, zref_data); + refer::VMul(x_data, y_data, zref_data, d); } auto trefe = GetCurrentUS(); @@ -623,7 +549,7 @@ TEST(JitKernel, vmul) { vmul_intri8(d, x_data, 
y_data, zref_data); } auto si1 = GetCurrentUS(); - VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; } #endif @@ -633,26 +559,19 @@ TEST(JitKernel, vmul) { } auto ttgte = GetCurrentUS(); - VLOG(30) << "Vec size " << d - << ": refer takes: " << (trefe - trefs) / repeat + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat #ifdef PADDLE_WITH_MKLML - << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " #else - << " us, " + << " us, " #endif - << "tgt takes: " << (ttgte - ttgts) / repeat; + << "tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -void vadd_ref(const int n, const float* x, const float* y, float* z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } -} - #if defined __AVX__ || defined __AVX2__ void vadd_intri8(const int n, const float* x, const float* y, float* z) { __m256 tmpx, tmpy; @@ -671,6 +590,7 @@ void vadd_mkl(const int n, const float* x, const float* y, float* z) { TEST(JitKernel, vadd) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 256, 512}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -684,7 +604,7 @@ TEST(JitKernel, vadd) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vadd_ref(d, x_data, y_data, zref_data); + refer::VAdd(x_data, y_data, zref_data, d); } auto trefe = GetCurrentUS(); @@ -703,7 +623,7 @@ TEST(JitKernel, vadd) { vadd_intri8(d, x_data, y_data, zref_data); } auto si1 = GetCurrentUS(); - VLOG(30) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; + VLOG(3) << "Vec size 8 intr takes: " << (si1 - si0) / repeat; } #endif @@ -713,26 +633,19 @@ TEST(JitKernel, vadd) { } auto ttgte = GetCurrentUS(); - VLOG(30) << "Vec size " << d - << ": refer takes: " << (trefe - trefs) / repeat + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat #ifdef PADDLE_WITH_MKLML - << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " + << " us, mkl takes: " << (tmkle - tmkls) / repeat << " us, " #else - << " us, " + << " us, " #endif - << "tgt takes: " << (ttgte - ttgts) / repeat; + << "tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } } } -void vaddrelu_ref(const int n, const float* x, const float* y, float* z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - z[i] = z[i] > 0 ? 
z[i] : 0; - } -} void vaddrelu_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VAddKernel>& vadd, @@ -745,6 +658,7 @@ void vaddrelu_better( TEST(JitKernel, vaddrelu) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 256, 512}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -762,7 +676,7 @@ TEST(JitKernel, vaddrelu) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vaddrelu_ref(d, x_data, y_data, zref_data); + refer::VAddRelu(x_data, y_data, zref_data, d); } auto trefe = GetCurrentUS(); auto tmkls = GetCurrentUS(); @@ -775,10 +689,9 @@ TEST(JitKernel, vaddrelu) { ker->Compute(x_data, y_data, ztgt_data, d); } auto ttgte = GetCurrentUS(); - VLOG(30) << "Vec size " << d - << ": refer takes: " << (trefe - trefs) / repeat - << " us, better takes: " << (tmkle - tmkls) / repeat << " us, " - << "tgt takes: " << (ttgte - ttgts) / repeat; + VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat + << " us, better takes: " << (tmkle - tmkls) / repeat << " us, " + << "tgt takes: " << (ttgte - ttgts) / repeat << " us"; for (int i = 0; i < d; ++i) { EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); } @@ -789,21 +702,23 @@ TEST(JitKernel, pool) { namespace jit = paddle::operators::math::jitkernel; const int frame_size = 4; std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); + + // empty call it to avoid unknown flag 'use_pinned_memory' on Mac + paddle::platform::jit::MayIUse(paddle::platform::jit::avx); const auto& plstm1 = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, frame_size, false); + .template Get, const jit::lstm_attr_t&>(attr); + const auto& plstm2 = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, frame_size, false); + .template Get, const jit::lstm_attr_t&>(attr); + EXPECT_EQ(plstm1, plstm2); + const auto& peephole = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, frame_size, true); + .template Get, const jit::lstm_attr_t&>( + jit::lstm_attr_t(frame_size, act_gate, act_cand, act_cell, true)); EXPECT_TRUE(plstm1 != peephole); const auto& pvmul_f = diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 1e56e29739..71b9293eed 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -19,16 +19,15 @@ namespace operators { namespace math { template -void MatrixBitCodeFunctor::Add(framework::Tensor* tmat, - const framework::Tensor& vec) { - SimpleCodeTable code_table(num_classes_); +void MatrixBitCodeFunctor::Add(const framework::Tensor& vec, + framework::Tensor* tmat) { size_t batch_size = tmat->dims()[0]; size_t width = tmat->dims()[1]; for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table_->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); + size_t index = code->calc_index(j); tmat->data()[i * width + j] += vec.data()[index]; } } @@ -37,31 
+36,46 @@ void MatrixBitCodeFunctor::Add(framework::Tensor* tmat, template void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, framework::Tensor* vec) { - SimpleCodeTable code_table(num_classes_); size_t batch_size = tmat.dims()[0]; size_t width = tmat.dims()[1]; for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table_->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); + size_t index = code->calc_index(j); vec->data()[index] += tmat.data()[i * width + j]; } } } +template +void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, + framework::SelectedRows* vec) { + size_t batch_size = tmat.dims()[0]; + size_t width = tmat.dims()[1]; + for (size_t i = 0; i < batch_size; ++i) { + auto code = code_table_->get_code(i); + int code_length = code->get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code->calc_index(j); + int64_t row_index = vec->GetIndexFromId(static_cast(index)); + vec->mutable_value()->data()[row_index] += + tmat.data()[i * width + j]; + } + } +} + template void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum) { - SimpleCodeTable code_table(num_classes_); size_t num_samples = tmat.dims()[0]; size_t o_width = tmat.dims()[1]; for (size_t i = 0; i < num_samples; ++i) { T sm = static_cast(0.0); - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table_->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - if (code.calc_bit(j)) { + if (code->calc_bit(j)) { // calc_bit starts from right most bit, while data in tmat[i] is in the // reverse order. 
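Note on the code-table refactor above: in the SimpleCode case, the per-sample work done through get_code(i) reduces to a few bit operations on the encoded class id. Below is a minimal standalone sketch of that arithmetic, assuming the same encoding SimpleCode uses (class c stored as c + num_classes, prefix bits selecting the weight row, suffix bits giving the branch label); the helper names are illustrative only and not part of the patch.

// Minimal sketch of the SimpleCode arithmetic (assumed encoding: class c is
// stored as c + num_classes, so its binary digits spell out the tree path).
#include <cstddef>
#include <cstdio>

// Position of the highest set bit, counting from 1 (0 when x == 0).
static int FindLastSet(std::size_t x) {
  int pos = 0;
  while (x) {
    ++pos;
    x >>= 1;
  }
  return pos;
}

int main() {
  const std::size_t num_classes = 6;
  const std::size_t c = 3;                   // class id
  const std::size_t code = c + num_classes;  // c_ in SimpleCode: 9 == 0b1001
  const int length = FindLastSet(code) - 1;  // get_length(): 3 inner nodes on the path
  for (int bit = 0; bit < length; ++bit) {
    std::size_t index = (code >> (bit + 1)) - 1;  // calc_index(): weight/bias row
    bool positive = (code & (1u << bit)) != 0;    // calc_bit(): branch label
    std::printf("bit %d -> row %zu, label %d\n", bit, index,
                static_cast<int>(positive));
  }
  return 0;
}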
sm += tmat.data()[i * o_width + j]; @@ -75,7 +89,6 @@ template void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, const framework::Tensor& weight, const framework::Tensor& input) { - SimpleCodeTable code_table(num_classes_); size_t num_samples = tmat->dims()[0]; size_t tmat_width = tmat->dims()[1]; size_t input_width = input.dims()[1]; @@ -84,10 +97,10 @@ void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, auto weight_value = weight.data(); auto input_value = input.data(); for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table_->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); + size_t index = code->calc_index(j); T sum = static_cast(0.0); for (size_t k = 0; k < input_width; ++k) { sum += weight_value[weight_width * index + k] * @@ -102,7 +115,6 @@ template void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight, const framework::Tensor& input) { - SimpleCodeTable code_table(num_classes_); size_t num_samples = tmat.dims()[0]; size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; @@ -111,10 +123,10 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, auto weight_value = weight->data(); auto input_value = input.data(); for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table_->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); + size_t index = code->calc_index(j); for (size_t k = 0; k < input_width; ++k) { weight_value[weight_width * index + k] += @@ -124,11 +136,35 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, } } +template +void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, + framework::SelectedRows* weight, + const framework::Tensor& input) { + size_t num_samples = tmat.dims()[0]; + size_t input_width = input.dims()[1]; + size_t tmat_width = tmat.dims()[1]; + size_t weight_width = weight->value().dims()[1]; + auto tmat_value = tmat.data(); + auto weight_value = weight->mutable_value()->data(); + auto input_value = input.data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table_->get_code(i); + int code_length = code->get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code->calc_index(j); + for (size_t k = 0; k < input_width; ++k) { + int64_t row_index = weight->GetIndexFromId(static_cast(index)); + weight_value[row_index * weight_width + k] += + tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; + } + } + } +} + template void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, const framework::Tensor& weight, framework::Tensor* input) { - SimpleCodeTable code_table(num_classes_); size_t num_samples = tmat.dims()[0]; size_t tmat_width = tmat.dims()[1]; size_t input_width = input->dims()[1]; @@ -138,10 +174,10 @@ void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, auto input_value = input->data(); for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table_->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); + size_t index = 
code->calc_index(j); for (size_t k = 0; k < input_width; ++k) { input_value[input_width * i + k] += @@ -154,14 +190,13 @@ void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, template void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) { - SimpleCodeTable code_table(num_classes_); size_t num_samples = tmat->dims()[0]; size_t o_width = tmat->dims()[1]; for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table_->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - if (code.calc_bit(j)) { + if (code->calc_bit(j)) { tmat->data()[i * o_width + j] -= 1; } } diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index c329b8b611..c30bb52641 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -92,9 +94,27 @@ inline int clz(const T& value) { inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); } #endif // !_WIN32 +// set a code interface to create multiple code +class Code { + public: + virtual ~Code() {} + virtual size_t calc_index(int bit) const = 0; + virtual bool calc_bit(int bit) const = 0; + virtual int get_length() const = 0; +}; +// set a CodeTable interface to create multiple code table +class CodeTable { + public: + virtual std::unique_ptr get_code(int64_t code) const = 0; + virtual size_t size() const = 0; + virtual int get_max_code_length() const = 0; + virtual ~CodeTable() {} +}; -struct SimpleCode { - SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {} +class SimpleCode : public Code { + public: + SimpleCode(size_t code, size_t num_classes, const int64_t* ids) + : c_(static_cast(ids[code]) + num_classes) {} /** * Here the id of root shoud be 1 rather than 0, thus the encoding of class c * is `c + num_classes` and all siblings can get the same weight indice using @@ -104,41 +124,121 @@ struct SimpleCode { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - inline size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; } - inline bool calc_bit(int bit) const { return c_ & (1 << bit); } - inline int get_length() const { return FindLastSet(c_) - 1; } + size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; } + bool calc_bit(int bit) const { return c_ & (1 << bit); } + int get_length() const { return FindLastSet(c_) - 1; } private: size_t c_; }; -struct SimpleCodeTable { - explicit SimpleCodeTable(size_t num_classes) : num_classes_(num_classes) {} - SimpleCode operator()(size_t code) const { - return SimpleCode(code, num_classes_); +template +class CustomCode : public Code { + public: + CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, + const int64_t* ids, int index) + : ids_(ids), index_(index) { + ptable_ = ptable.Slice(index, index + 1); + pcode_ = pcode.Slice(index, index + 1); + } + /** + * Here the id of root shoud be 1 rather than 0, thus the encoding of class c + * is `c + num_classes` and all siblings can get the same weight indice using + * prefixes. 
+ * Weight index is the prefixes of encoding, thus leave out the right most + * bit in calc_index. + * Binary classification path is the suffixes of encoding, thus leave out the + * left most bit in calc_bit. + */ + size_t calc_index(int bit) const { return ptable_.data()[bit]; } + bool calc_bit(int bit) const { return pcode_.data()[bit]; } + int get_length() const { + int length = 0; + + for (int i = 0; i < static_cast(ptable_.dims()[1]); i++) { + if (ptable_.data()[i] >= 0) { + length++; + } else { + return length; + } + } + return length; + } + + private: + framework::Tensor ptable_; + framework::Tensor pcode_; + const int64_t* ids_; + const int index_; +}; + +class SimpleCodeTable : public CodeTable { + public: + SimpleCodeTable(size_t num_classes, const int64_t* ids) + : num_classes_(num_classes), ids_(ids) {} + std::unique_ptr get_code(int64_t code) const { + std::unique_ptr coder(new SimpleCode(code, num_classes_, ids_)); + return coder; } size_t size() const { return num_classes_; } int get_max_code_length() const { return FindLastSet(num_classes_ - 1); } private: size_t num_classes_; + const int64_t* ids_; +}; + +template +class CustomCodeTable : public CodeTable { + public: + CustomCodeTable(const framework::Tensor& ptable, + const framework::Tensor& pcode, const int64_t* ids) + : ptable_(ptable), pcode_(pcode), ids_(ids) {} + + std::unique_ptr get_code(int64_t code) const { + std::unique_ptr coder(new CustomCode(ptable_, pcode_, ids_, code)); + return coder; + } + + size_t size() const { return static_cast(ptable_.dims()[1]); } + int get_max_code_length() const { + return static_cast(ptable_.dims()[1]); + } + + private: + const framework::Tensor& ptable_; + const framework::Tensor& pcode_; + const int64_t* ids_; }; template class MatrixBitCodeFunctor { public: - explicit MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids) - : num_classes_(num_classes), ids_(ids) {} + MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids) + : num_classes_(num_classes), + ids_(ids), + code_table_(new SimpleCodeTable(num_classes, ids)) {} + + MatrixBitCodeFunctor(const framework::Tensor& ptable, + const framework::Tensor& pcode, const int64_t* ids) + : num_classes_(static_cast(ptable.dims()[1])), + ids_(ids), + code_table_(new CustomCodeTable(ptable, pcode, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ - void Add(framework::Tensor* tmat, const framework::Tensor& vec); + void Add(const framework::Tensor& vec, framework::Tensor* tmat); /* For j < code_length vec(0, index(i, j)) += tmat(i, j) */ void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); + /* For selected rows For j < code_length + vec(0, index(i, j)) += tmat(i, j) + */ + void AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec); + /* For j < code_length sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) */ @@ -159,6 +259,12 @@ class MatrixBitCodeFunctor { */ void MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight, const framework::Tensor& input); + /* For SelectedRows Weight, For index(i, j) >= 0: + weight.row(index(i, j)) += tmat(i, j) * input.row(i) + */ + void MulGradWeight(const framework::Tensor& tmat, + framework::SelectedRows* weight, + const framework::Tensor& input); /* For j < code_length input.row(i) += tmat(i, j) * weight.row(index(i, j)) */ @@ -167,6 +273,7 @@ class MatrixBitCodeFunctor { size_t num_classes_; const int64_t* ids_; + std::unique_ptr code_table_; }; } // namespace math } // namespace operators diff --git 
a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc index 690d6f6baa..2708f3bcd8 100644 --- a/paddle/fluid/operators/math/sampler.cc +++ b/paddle/fluid/operators/math/sampler.cc @@ -60,75 +60,30 @@ float LogUniformSampler::Probability(int64_t value) const { return (log((value + 2.0) / (value + 1.0))) / log_range_; } -CustomSampler::CustomSampler(int64_t range, const float* probabilities, +CustomSampler::CustomSampler(int64_t range, const float *probabilities, + const int *alias, const float *alias_probabilities, unsigned int seed) : Sampler(range, seed) { - random_engine_ = std::make_shared(seed_); + random_engine_ = std::make_shared(seed_); real_dist_ = std::make_shared>(0, 1); int_dist_ = std::make_shared>(0, range); - alias_probs_ = std::make_shared>(range + 1); - alias_ = std::make_shared>(range + 1); - probs_ = std::make_shared>(range + 1); - - std::queue> bigs; - std::queue> littles; - for (int64_t i = 0; i <= range; ++i) { - (*probs_)[i] = probabilities[i]; - float normal_prob = probabilities[i] * (range + 1); - if (normal_prob - 1.0 > 1e-4) { - bigs.emplace(i, normal_prob); - } else if (1.0 - normal_prob > 1e-4) { - littles.emplace(i, normal_prob); - } else { - (*alias_probs_)[i] = normal_prob; - (*alias_)[i] = -1; - } - } - - while ((!littles.empty()) && (!bigs.empty())) { - auto big = bigs.front(); - auto little = littles.front(); - bigs.pop(); - littles.pop(); - (*alias_probs_)[little.first] = little.second; - (*alias_)[little.first] = big.first; - auto big_left = big.second - (1 - little.second); - if (big_left - 1.0 > 1e-4) { - bigs.emplace(big.first, big_left); - } else if (1.0 - big_left > 1e-4) { - littles.emplace(big.first, big_left); - } else { - (*alias_probs_)[big.first] = big_left; - (*alias_)[big.first] = -1; - } - } - if (!littles.empty()) { // littles.second is close to 1.0 - auto little = littles.front(); - (*alias_probs_)[little.first] = 1.0; - (*alias_)[little.first] = -1; - } - - if (!bigs.empty()) { // bigs.second is close to 1.0 - auto big = bigs.front(); - (*alias_probs_)[big.first] = 1.0; - (*alias_)[big.first] = -1; - } + alias_probs_ = alias_probabilities; + probs_ = probabilities; + alias_ = alias; } int64_t CustomSampler::Sample() const { auto index = (*int_dist_)(*random_engine_); auto p = (*real_dist_)(*random_engine_); - if (p > (*alias_probs_)[index]) { - return (*alias_)[index]; + if (p > alias_probs_[index]) { + return alias_[index]; } else { return index; } } -float CustomSampler::Probability(int64_t value) const { - return (*probs_)[value]; -} +float CustomSampler::Probability(int64_t value) const { return probs_[value]; } } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 836cdad51f..98e0b898a5 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include #include #include @@ -38,9 +39,12 @@ class Sampler { seed_ = seed; } } + virtual ~Sampler(); + // Sample a single value virtual int64_t Sample() const = 0; + // The probability that a single call to Sample() returns the given value. 
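Note on the CustomSampler change above: the Walker/Vose alias-table construction has been moved out of the constructor, which now just borrows precomputed probs/alias/alias_probabilities arrays. A rough sketch of how a caller could build such a table, mirroring the logic deleted from sampler.cc (simplified: no epsilon tolerances, and it assumes probs is already normalized; BuildAliasTable is an illustrative name, not an API added by this patch):

#include <queue>
#include <utility>
#include <vector>

// Build the alias table for O(1) sampling from a discrete distribution.
// probs must sum to 1; alias[i] == -1 means bin i never redirects.
void BuildAliasTable(const std::vector<float>& probs, std::vector<int>* alias,
                     std::vector<float>* alias_probs) {
  const int n = static_cast<int>(probs.size());
  alias->assign(n, -1);
  alias_probs->assign(n, 1.0f);
  std::queue<std::pair<int, float>> bigs, littles;
  for (int i = 0; i < n; ++i) {
    const float scaled = probs[i] * n;
    if (scaled > 1.0f) {
      bigs.emplace(i, scaled);
    } else if (scaled < 1.0f) {
      littles.emplace(i, scaled);
    }  // scaled == 1: keep alias_probs[i] == 1 and alias[i] == -1
  }
  while (!bigs.empty() && !littles.empty()) {
    auto big = bigs.front();
    auto little = littles.front();
    bigs.pop();
    littles.pop();
    (*alias_probs)[little.first] = little.second;  // keep `little` with prob p
    (*alias)[little.first] = big.first;            // otherwise redirect to `big`
    const float left = big.second - (1.0f - little.second);
    if (left > 1.0f) {
      bigs.emplace(big.first, left);
    } else if (left < 1.0f) {
      littles.emplace(big.first, left);
    }
  }
  // Leftover bins are numerically ~1.0; treat them as exact.
  while (!littles.empty()) {
    (*alias_probs)[littles.front().first] = 1.0f;
    littles.pop();
  }
  while (!bigs.empty()) {
    (*alias_probs)[bigs.front().first] = 1.0f;
    bigs.pop();
  }
}

Sampling then matches the new CustomSampler::Sample(): draw a uniform index i and a uniform p in [0, 1), and return alias[i] if p > alias_probs[i], otherwise i.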
virtual float Probability(int64_t value) const = 0; @@ -99,6 +103,7 @@ class LogUniformSampler : public Sampler { class CustomSampler : public Sampler { public: explicit CustomSampler(int64_t range, const float* probabilities, + const int* alias, const float* alias_probabilities, unsigned int seed = 0UL); ~CustomSampler() override {} @@ -108,10 +113,10 @@ class CustomSampler : public Sampler { float Probability(int64_t value) const override; private: - std::shared_ptr> alias_probs_; - std::shared_ptr> alias_; - std::shared_ptr> probs_; - std::shared_ptr random_engine_; + const float* alias_probs_; + const int* alias_; + const float* probs_; + std::shared_ptr random_engine_; std::shared_ptr> real_dist_; std::shared_ptr> int_dist_; }; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 5978c1d605..3eba268cfa 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -270,7 +270,7 @@ struct MergeAdd { const std::vector& inputs, framework::SelectedRows* output) { if (inputs.size() == 0) { - VLOG(30) << "no input! return"; + VLOG(3) << "no input! return"; return; } const framework::SelectedRows* has_value_input = nullptr; @@ -281,7 +281,7 @@ struct MergeAdd { } } if (has_value_input == nullptr) { - VLOG(30) << "no input has value! just return" << std::endl; + VLOG(3) << "no input has value! just return" << std::endl; return; } auto input_width = has_value_input->value().dims()[1]; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 74b9659cfd..c4fccdbf86 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -314,7 +314,7 @@ struct MergeAdd { const std::vector& inputs, framework::SelectedRows* output) { if (inputs.size() == 0) { - VLOG(30) << "no input! return"; + VLOG(3) << "no input! return"; return; } const framework::SelectedRows* has_value_input = nullptr; @@ -325,7 +325,7 @@ struct MergeAdd { } } if (has_value_input == nullptr) { - VLOG(30) << "no input has value! just return" << std::endl; + VLOG(3) << "no input has value! just return" << std::endl; return; } auto input_width = has_value_input->value().dims()[1]; diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 0015fafbc8..51da6de26e 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -16,13 +16,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { namespace math { -#define FLT_MAX __FLT_MAX__ - template struct MaxPoolFunctor { HOSTDEVICE void operator()(const T* input, const size_t start, diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 0f3e5b2008..31ed519666 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -100,11 +100,8 @@ class SoftmaxFunctor> { blas.VEXP(num_classes * batch_size, out_data, out_data); for (int n = 0; n < batch_size; ++n) { - entities[n] = out_data[n * num_classes]; - for (int c = 1; c < num_classes; ++c) { - entities[n] += out_data[n * num_classes + c]; - } - blas.SCAL(num_classes, 1.0f / entities[n], &out_data[n * num_classes]); + auto sum = blas.ASUM(num_classes, &out_data[n * num_classes], 1); + blas.SCAL(num_classes, 1.0f / sum, &out_data[n * num_classes]); } } }; diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h index fb370842d1..4ab5cfe53c 100644 --- a/paddle/fluid/operators/metrics/auc_op.h +++ b/paddle/fluid/operators/metrics/auc_op.h @@ -75,8 +75,13 @@ class AucKernel : public framework::OpKernel { const auto *label_data = label->data(); for (size_t i = 0; i < batch_size; i++) { - uint32_t binIdx = static_cast( - inference_data[i * inference_width + 1] * num_thresholds); + auto predict_data = inference_data[i * inference_width + 1]; + PADDLE_ENFORCE_LE(predict_data, 1, + "The predict data must less or equal 1."); + PADDLE_ENFORCE_GE(predict_data, 0, + "The predict data must gather or equal 0."); + + uint32_t binIdx = static_cast(predict_data * num_thresholds); if (label_data[i]) { (*stat_pos)[binIdx] += 1.0; } else { diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 7e434c293c..8a111e6065 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -38,9 +38,9 @@ class MulOp : public framework::OperatorWithKernel { int x_num_col_dims = ctx->Attrs().Get("x_num_col_dims"); int y_num_col_dims = ctx->Attrs().Get("y_num_col_dims"); - VLOG(30) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims - << " x_num_col_dims=" << x_num_col_dims - << " y_num_col_dims=" << y_num_col_dims; + VLOG(3) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims + << " x_num_col_dims=" << x_num_col_dims + << " y_num_col_dims=" << y_num_col_dims; PADDLE_ENFORCE_GT( x_dims.size(), x_num_col_dims, diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index 9db0031a69..8de974bc2b 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -63,16 +63,16 @@ class NCCLAllReduceKernel : public framework::OpKernel { // device id int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); - VLOG(30) << "gpu : " - << " invoke allreduce. send " << x->numel() << " recv " - << out->numel(); + VLOG(3) << "gpu : " + << " invoke allreduce. send " << x->numel() << " recv " + << out->numel(); PADDLE_ENFORCE(platform::dynload::ncclAllReduce( x->data(), out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, reduction_op_, comm->comms().at(idx), ctx.cuda_device_context().stream())); - VLOG(30) << "gpu : " - << " finished allreduce. 
send " << x->numel() << " recv " - << out->numel(); + VLOG(3) << "gpu : " + << " finished allreduce. send " << x->numel() << " recv " + << out->numel(); } }; @@ -109,14 +109,14 @@ class NCCLReduceKernel : public framework::OpKernel { } else { out->Resize(framework::make_ddim({0})); } - VLOG(30) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() - << " recv " << out->numel(); + VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() + << " recv " << out->numel(); PADDLE_ENFORCE(platform::dynload::ncclReduce( x->data(), recvbuffer, x->numel(), NCCLTypeWrapper::type, reduction_op_, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); - VLOG(30) << "gpu : " << gpu_id << " finished reduce. send " << x->numel() - << " recv " << out->numel(); + VLOG(3) << "gpu : " << gpu_id << " finished reduce. send " << x->numel() + << " recv " << out->numel(); } }; @@ -133,22 +133,21 @@ class NCCLBcastKernel : public framework::OpKernel { int idx = comm->GetCommId(gpu_id); if (idx == root) { auto* x = ctx.Input("X"); - VLOG(30) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); + VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); PADDLE_ENFORCE(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), x->numel(), NCCLTypeWrapper::type, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); - VLOG(30) << "gpu : " << gpu_id << " finished Bcast."; + VLOG(3) << "gpu : " << gpu_id << " finished Bcast."; } else { auto* out = ctx.Output("Out"); - VLOG(30) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " - << framework::product(out->dims()); + VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " + << framework::product(out->dims()); PADDLE_ENFORCE(platform::dynload::ncclBcast( out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); - VLOG(30) << "gpu : " << gpu_id << " finished Bcast. recv " - << out->numel(); + VLOG(3) << "gpu : " << gpu_id << " finished Bcast. 
recv " << out->numel(); } } }; diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index f48ccdd97f..d5fb7a12e5 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -86,9 +86,9 @@ class NCCLTester : public ::testing::Test { (*p_scopes).resize(gpu_list_.size()); auto op = f::OpRegistry::CreateOp(*op1); - VLOG(10) << "invoke NCCLInitOp."; + VLOG(1) << "invoke NCCLInitOp."; op->Run(g_scope_, cpu_place); - VLOG(10) << "NCCLInitOp finished."; + VLOG(1) << "NCCLInitOp finished."; } int GetGPUData(int gpu_id) { return gpu_id + 42; } @@ -109,7 +109,7 @@ class NCCLTester : public ::testing::Test { std::vector send_vector(f::product(kDims), GetGPUData(gpu_id)); paddle::framework::TensorFromVector(send_vector, *ctx, send_tensor); - VLOG(10) << "Send Tensor filled with elements " << send_tensor->numel(); + VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); } lk.unlock(); @@ -119,11 +119,11 @@ class NCCLTester : public ::testing::Test { auto op = f::OpRegistry::CreateOp(*op1); - VLOG(10) << "Device : " << gpu_id << " invoke " << op_desc.Type(); - VLOG(10) << " send_tensor : " << send_tensor->numel() - << " recv_tensor : " << recv_tensor->numel(); + VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type(); + VLOG(1) << " send_tensor : " << send_tensor->numel() + << " recv_tensor : " << recv_tensor->numel(); op->Run(*scope, place); - VLOG(10) << "Device : " << gpu_id << " finished " << op_desc.Type(); + VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type(); } public: diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 9b0d45ae5b..9f97f7821d 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/nce_op.h" +#include #include namespace paddle { @@ -25,7 +26,7 @@ class NCEOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Input")); PADDLE_ENFORCE(ctx->HasInput("Label")); PADDLE_ENFORCE(ctx->HasInput("Weight")); @@ -67,7 +68,7 @@ class NCEOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), platform::CPUPlace()); @@ -101,11 +102,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddInput( - "CustomDistribution", + "CustomDistProbs", "(Tensor) It is used in 'CostumDist' sampler. " "It is a tensor with shape [num_total_classes]." "The i-th element is the probsbility of the i-th class being sampled.") .AsDispensable(); + AddInput( + "CustomDistAlias", + "(Tensor) It is used in 'CostumDist' sampler. " + "It is a tensor with shape [num_total_classes]." + "The i-th element is the probsbility of the i-th class being sampled.") + .AsDispensable(); + AddInput( + "CustomDistAliasProbs", + "(Tensor) It is used in 'CostumDist' sampler. " + "It is a tensor with shape [num_total_classes]." 
+ "The i-th element is the probsbility of the i-th class being sampled.") + .AsDispensable(); + AddOutput("Cost", "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples."); AddOutput("SampleLogits", @@ -124,21 +138,22 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { "kernel to compute grads." "") .AsIntermediate(); + AddAttr("num_total_classes", "Total number of classes in all samples."); AddAttr("num_neg_samples", "The number of negative classes. The default value is 10.") .SetDefault(10); - AddAttr("sampler", "(int) Which sampler to be used to sample negative class." "0: Uniform; 1: LogUniform; 2: CostumDist.") .SetDefault(0); - AddAttr("seed", "(int) The seed used in sampler. If it is 0, " "the sampler will generate a seed randomly.") .SetDefault(0); + AddAttr("is_sparse", "(boolean, default false) Sparse update.") + .SetDefault(false); AddAttr>("custom_neg_classes", "This attribute only be used in unitest. Classes " @@ -147,20 +162,28 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { "user should avoid setting this attribute.") .SetDefault({}); AddComment(R"DOC( -Compute and return the noise-contrastive estimation training loss. See -`Noise-contrastive estimation: A new estimation principle for unnormalized -statistical models +Compute and return the noise-contrastive estimation training loss. See +`Noise-contrastive estimation: A new estimation principle for unnormalized +statistical models `_. By default this operator uses a uniform distribution for sampling. )DOC"); } }; +class NCEOpGradDescMaker : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { return "nce_grad"; } +}; + class NCEOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Input")); PADDLE_ENFORCE(ctx->HasInput("Weight")); PADDLE_ENFORCE(ctx->HasInput("Cost")); @@ -190,20 +213,45 @@ class NCEOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), platform::CPUPlace()); } }; +class NCEOpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front(); + auto bias_grad = op_desc.Output(framework::GradVarName("Bias")).front(); + + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad + << " is set to SelectedRows"; + block->Var(weight_grad) + ->SetType(framework::proto::VarType::SELECTED_ROWS); + block->Var(bias_grad)->SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad + << " is set to LoDTensor"; + block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR); + block->Var(bias_grad)->SetType(framework::proto::VarType::LOD_TENSOR); + } + 
block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType()); + block->Var(bias_grad)->SetDataType(block->Var("Input")->GetDataType()); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad); +REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpGradDescMaker, ops::NCEOpMaker); +REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad, ops::NCEOpGradVarTypeInference); REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel, ops::NCEKernel); REGISTER_OP_CPU_KERNEL(nce_grad, diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index e9af8ad4ce..f2ca6ec247 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -16,26 +16,32 @@ limitations under the License. */ #include #include +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/math/sampler.h" #include "unsupported/Eigen/CXX11/Tensor" + namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; using Sampler = math::Sampler; +using DDim = framework::DDim; template using EigenMatrix = framework::EigenMatrix; template -void PrepareSamples(const framework::ExecutionContext& context, - Sampler* sampler) { +void PrepareSamples(const framework::ExecutionContext &context, + Sampler *sampler) { auto label = context.Input("Label"); - const int64_t* label_data = label->data(); + const int64_t *label_data = label->data(); auto label_dims = label->dims(); // int num_total_classes = context.Attr("num_total_classes"); // for unitest @@ -44,7 +50,7 @@ void PrepareSamples(const framework::ExecutionContext& context, auto sample_labels = context.Output("SampleLabels"); auto sample_labels_dims = sample_labels->dims(); - int64_t* sample_labels_data = + int64_t *sample_labels_data = sample_labels->mutable_data(context.GetPlace()); int num_label = label_dims.size() == 2 ? 
label_dims[1] : 1; @@ -70,13 +76,13 @@ void PrepareSamples(const framework::ExecutionContext& context, template class NCEKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext &context) const override { int sampler_type = context.Attr("sampler"); int seed = context.Attr("seed"); int num_total_classes = context.Attr("num_total_classes"); int num_neg_samples = context.Attr("num_neg_samples"); - Sampler* sampler; + Sampler *sampler; switch (sampler_type) { case 0: { sampler = new math::UniformSampler(num_total_classes - 1, seed); @@ -87,11 +93,19 @@ class NCEKernel : public framework::OpKernel { break; } case 2: { - auto custom_dist = context.Input("CustomDistribution"); - const float* custom_dist_data = custom_dist->data(); - PADDLE_ENFORCE_EQ(custom_dist->numel(), num_total_classes); - sampler = new math::CustomSampler(num_total_classes - 1, - custom_dist_data, seed); + auto dist_probs = context.Input("CustomDistProbs"); + auto dist_alias = context.Input("CustomDistAlias"); + auto dist_alias_probs = context.Input("CustomDistAliasProbs"); + + PADDLE_ENFORCE_EQ(dist_probs->numel(), num_total_classes); + PADDLE_ENFORCE_EQ(dist_alias->numel(), num_total_classes); + PADDLE_ENFORCE_EQ(dist_alias_probs->numel(), num_total_classes); + + const float *probs_data = dist_probs->data(); + const int *alias_data = dist_alias->data(); + const float *alias_probs_data = dist_alias_probs->data(); + sampler = new math::CustomSampler(num_total_classes - 1, probs_data, + alias_data, alias_probs_data, seed); break; } default: { PADDLE_THROW("Unsupported SamplerType."); } @@ -99,17 +113,17 @@ class NCEKernel : public framework::OpKernel { PrepareSamples(context, sampler); auto sample_labels = context.Output("SampleLabels"); - const int64_t* sample_labels_data = sample_labels->data(); + const int64_t *sample_labels_data = sample_labels->data(); auto sample_out = context.Output("SampleLogits"); - T* sample_out_data = sample_out->mutable_data(context.GetPlace()); + T *sample_out_data = sample_out->mutable_data(context.GetPlace()); auto label = context.Input("Label"); auto sample_weight = context.Input("SampleWeight"); - const T* sample_weight_data = nullptr; + const T *sample_weight_data = nullptr; if (sample_weight != nullptr) { sample_weight_data = sample_weight->data(); } auto out = context.Output("Cost"); - T* out_data = out->mutable_data(context.GetPlace()); + T *out_data = out->mutable_data(context.GetPlace()); int64_t num_true_class = 1; if (label != nullptr) { num_true_class = label->dims()[1]; @@ -119,7 +133,7 @@ class NCEKernel : public framework::OpKernel { // forward bias auto bias = context.Input("Bias"); if (bias != nullptr) { - const T* bias_data = bias->data(); + const T *bias_data = bias->data(); for (int64_t i = 0; i < sample_labels->numel(); ++i) { sample_out_data[i] = bias_data[sample_labels_data[i]]; } @@ -158,16 +172,16 @@ class NCEKernel : public framework::OpKernel { template class NCEGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext &context) const override { auto d_out = context.Input(framework::GradVarName("Cost")); - const T* d_out_data = d_out->data(); + const T *d_out_data = d_out->data(); auto label = context.Input("Label"); auto sample_out = context.Input("SampleLogits"); - const T* sample_out_data = sample_out->data(); + const T *sample_out_data = 
sample_out->data(); auto sample_labels = context.Input("SampleLabels"); - const int64_t* sample_labels_data = sample_labels->data(); + const int64_t *sample_labels_data = sample_labels->data(); auto sample_weight = context.Input("SampleWeight"); - const T* sample_weight_data = nullptr; + const T *sample_weight_data = nullptr; if (sample_weight != nullptr) { sample_weight_data = sample_weight->data(); } @@ -180,7 +194,7 @@ class NCEGradKernel : public framework::OpKernel { int sampler_type = context.Attr("sampler"); int seed = context.Attr("seed"); - Sampler* sampler; + Sampler *sampler; switch (sampler_type) { case 0: { sampler = new math::UniformSampler(num_total_classes - 1, seed); @@ -191,11 +205,19 @@ class NCEGradKernel : public framework::OpKernel { break; } case 2: { - auto custom_dist = context.Input("CustomDistribution"); - const float* custom_dist_data = custom_dist->data(); - PADDLE_ENFORCE_EQ(custom_dist->numel(), num_total_classes); - sampler = new math::CustomSampler(num_total_classes - 1, - custom_dist_data, seed); + auto dist_probs = context.Input("CustomDistProbs"); + auto dist_alias = context.Input("CustomDistAlias"); + auto dist_alias_probs = context.Input("CustomDistAliasProbs"); + + PADDLE_ENFORCE_EQ(dist_probs->numel(), num_total_classes); + PADDLE_ENFORCE_EQ(dist_alias->numel(), num_total_classes); + PADDLE_ENFORCE_EQ(dist_alias_probs->numel(), num_total_classes); + + const float *probs_data = dist_probs->data(); + const int *alias_data = dist_alias->data(); + const float *alias_probs_data = dist_alias_probs->data(); + sampler = new math::CustomSampler(num_total_classes - 1, probs_data, + alias_data, alias_probs_data, seed); break; } default: { PADDLE_THROW("Unsupported SamplerType."); } @@ -203,7 +225,7 @@ class NCEGradKernel : public framework::OpKernel { // T b = 1. 
/ num_total_classes * num_neg_samples; Tensor sample_grad; // tmp tensor - T* sample_grad_data = + T *sample_grad_data = sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); // backward cost for (int64_t i = 0; i < sample_labels->numel(); ++i) { @@ -217,32 +239,105 @@ class NCEGradKernel : public framework::OpKernel { : w * (o * (1 - o) / (o + b)); sample_grad_data[i] *= d_out_data[sample_idx]; } - // get d_bias - auto d_bias = context.Output(framework::GradVarName("Bias")); - if (d_bias != nullptr) { - T* d_bias_data = d_bias->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + + bool is_sparse = context.Attr("is_sparse"); + + if (!is_sparse) { + // get d_bias + auto d_bias = context.Output(framework::GradVarName("Bias")); + if (d_bias != nullptr) { + T *d_bias_data = d_bias->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + } + } + // get d_w + auto d_w = context.Output(framework::GradVarName("Weight")); + if (d_w != nullptr) { + auto d_w_data = d_w->mutable_data(context.GetPlace()); + std::fill(d_w_data, d_w_data + d_w->numel(), 0.0); + auto d_w_matrix = EigenMatrix::From(*d_w); + auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_w_matrix.chip(sample_labels_data[i], 0) += + x_matrix.chip(static_cast(i / sample_labels->dims()[1]), 0) * + sample_grad_data[i]; + } + } + } else { + std::vector labels; for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + labels.push_back(sample_labels_data[i]); } - } - // get d_w - auto d_w = context.Output(framework::GradVarName("Weight")); - if (d_w != nullptr) { - auto d_w_data = d_w->mutable_data(context.GetPlace()); - std::fill(d_w_data, d_w_data + d_w->numel(), 0.0); - auto d_w_matrix = EigenMatrix::From(*d_w); + std::set st(labels.begin(), labels.end()); + labels.assign(st.begin(), st.end()); + + auto *bias_var = context.InputVar("Bias"); + DDim bias_dim; + if (bias_var->IsType()) { + bias_dim = context.Input("Bias")->dims(); + } else if (bias_var->IsType()) { + auto *table_t = context.Input("Bias"); + bias_dim = table_t->value().dims(); + } else { + PADDLE_THROW( + "The parameter Bias of a NCE_OP " + "must be either LoDTensor or SelectedRows"); + } + + auto d_bias = + context.Output(framework::GradVarName("Bias")); + d_bias->set_rows(labels); + d_bias->set_height(bias_dim[0]); + + d_bias->mutable_value()->Resize( + {static_cast(labels.size()), bias_dim[1]}); + T *d_bias_data = + d_bias->mutable_value()->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + labels.size(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[d_bias->Index(sample_labels_data[i])] += + sample_grad_data[i]; + } + + auto *table_var = context.InputVar("Weight"); + DDim table_dim; + if (table_var->IsType()) { + table_dim = context.Input("Weight")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("Weight"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW( + "The parameter Weight of a NCE_OP " + "must be either LoDTensor or SelectedRows"); + } + + auto d_w = context.Output(framework::GradVarName("Weight")); + + d_w->set_rows(labels); + d_w->set_height(table_dim[0]); + + auto *d_table_value = d_w->mutable_value(); + 
d_table_value->Resize( + {static_cast(labels.size()), table_dim[1]}); + auto d_w_data = d_table_value->mutable_data(context.GetPlace()); + std::fill(d_w_data, d_w_data + d_table_value->numel(), 0.0); + + auto d_w_matrix = EigenMatrix::From(*d_table_value); auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_w_matrix.chip(sample_labels_data[i], 0) += + d_w_matrix.chip(d_w->Index(sample_labels_data[i]), 0) += x_matrix.chip(static_cast(i / sample_labels->dims()[1]), 0) * sample_grad_data[i]; } } + // get d_x auto d_x = context.Output(framework::GradVarName("Input")); if (d_x != nullptr) { - auto* d_x_data = d_x->mutable_data(context.GetPlace()); + auto *d_x_data = d_x->mutable_data(context.GetPlace()); std::fill(d_x_data, d_x_data + d_x->numel(), 0.0); auto d_x_matrix = EigenMatrix::From(*d_x); auto w_matrix = EigenMatrix::From(*(context.Input("Weight"))); @@ -251,6 +346,7 @@ class NCEGradKernel : public framework::OpKernel { w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; } } + delete sampler; } }; diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 48e0448d09..3455d1ee54 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -297,7 +297,7 @@ class AdamOpKernel : public framework::OpKernel { auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); if (grad.rows().size() == 0) { - VLOG(30) << "grad row size is 0!!"; + VLOG(3) << "grad row size is 0!!"; return; } diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index e5b756b4fa..71f079e4d9 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -346,7 +346,7 @@ class MomentumOpKernel : public framework::OpKernel { // sparse update maybe empty. 
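Note on the sparse path in the NCE gradient kernel above: with is_sparse the Weight/Bias gradients become SelectedRows — the touched class ids are deduplicated into the row list, the value tensor is resized to [rows, width], and each per-sample gradient is scatter-added through a row-id lookup. A compressed illustration of that pattern in isolation (SparseGrad and RowIndex are stand-ins for SelectedRows and its Index() method, not code from the patch):

#include <algorithm>
#include <cstddef>
#include <vector>

// Stand-in for a SelectedRows gradient: unique row ids plus a dense
// [rows x width] value buffer.
struct SparseGrad {
  std::vector<long long> rows;  // sorted unique ids of the touched rows
  std::vector<float> value;     // rows.size() * width, row-major, zero-initialized
  int width = 0;

  // Stand-in for SelectedRows::Index(): position of a row id inside `rows`.
  std::size_t RowIndex(long long id) const {
    return std::lower_bound(rows.begin(), rows.end(), id) - rows.begin();
  }
};

SparseGrad MakeSparseGrad(std::vector<long long> touched_ids, int width) {
  SparseGrad g;
  std::sort(touched_ids.begin(), touched_ids.end());
  touched_ids.erase(std::unique(touched_ids.begin(), touched_ids.end()),
                    touched_ids.end());
  g.rows = std::move(touched_ids);
  g.width = width;
  g.value.assign(g.rows.size() * width, 0.0f);
  return g;
}

// Scatter-add one gradient row, as the kernel does once per sampled label.
void ScatterAdd(SparseGrad* g, long long id, const float* grad_row) {
  float* dst = g->value.data() + g->RowIndex(id) * g->width;
  for (int k = 0; k < g->width; ++k) dst[k] += grad_row[k];
}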
if (grad->rows().size() == 0) { - VLOG(30) << "Grad SelectedRows contains no data!"; + VLOG(3) << "Grad SelectedRows contains no data!"; return; } auto* merged_grad = const_cast(ctx.scope()) diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index b27ef27e29..98bae5e1d3 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -98,10 +98,10 @@ class SGDOpKernel : public framework::OpKernel { auto param_row_width = param.value().dims()[1]; auto grad_row_width = grad.value().dims()[1]; - VLOG(40) << " param rows: " << param.rows().size() - << " param memory rows: " << param.value().dims()[0] - << " grad rows: " << grad.rows().size() - << " grad memory rows: " << grad.value().dims()[0]; + VLOG(4) << " param rows: " << param.rows().size() + << " param memory rows: " << param.value().dims()[0] + << " grad rows: " << grad.rows().size() + << " grad memory rows: " << grad.value().dims()[0]; PADDLE_ENFORCE_EQ(param_row_width, grad_row_width, "param_row should have the same size with grad_row"); diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index a706d05fd7..a9da21f479 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -319,20 +319,46 @@ void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels, } } +static inline void GetPaddings(int* paddings, + const framework::ExecutionContext& context) { + auto* paddings_t = context.Input("Paddings"); + if (paddings_t) { + auto paddings_data = paddings_t->data(); + paddings[0] = paddings_data[0]; + paddings[1] = paddings_data[1]; + paddings[2] = paddings_data[2]; + paddings[3] = paddings_data[3]; + } else { + auto pads = context.Attr>("paddings"); + std::copy(pads.begin(), pads.end(), paddings); + } +} + template class Pad2dCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); + int pads[4]; + GetPaddings(pads, context); auto mode = context.Attr("mode"); auto data_format = context.Attr("data_format"); T value = context.Attr("pad_value"); + auto* x = context.Input("X"); - auto* out = context.Output("Out"); auto in_dims = x->dims(); - auto out_dims = out->dims(); const T* in_data = x->data(); + + auto* out = context.Output("Out"); + if (data_format == "NCHW") { + out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[0] + pads[1], + in_dims[3] + pads[2] + pads[3]}); + } else { + out->Resize({in_dims[0], in_dims[1] + pads[0] + pads[1], + in_dims[2] + pads[2] + pads[3], in_dims[3]}); + } + auto out_dims = out->dims(); T* out_data = out->mutable_data(context.GetPlace()); + const int pad_top = pads[0]; const int pad_left = pads[2]; const int num = in_dims[0]; @@ -376,7 +402,8 @@ template class Pad2dGradCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); + int pads[4]; + GetPaddings(pads, context); auto mode = context.Attr("mode"); auto data_format = context.Attr("data_format"); auto* d_out = context.Input(framework::GradVarName("Out")); @@ -442,21 +469,35 @@ class Pad2dOp : public framework::OperatorWithKernel { "Output(Out) of Pad2dOp should not be null."); auto x_dim = ctx->GetInputDim("X"); - auto paddings = ctx->Attrs().Get>("paddings"); PADDLE_ENFORCE_EQ(x_dim.size(), 4, - "Size of paddings should be equal to 4."); - std::vector out_dims(x_dim.size()); + 
"The size of input(X)'s dimension should be equal to 4."); + std::vector out_dims(x_dim.size()); auto data_format = ctx->Attrs().Get("data_format"); out_dims[0] = x_dim[0]; - if (data_format == "NCHW") { + if (ctx->HasInput("Paddings")) { + auto paddings_dim = ctx->GetInputDim("Paddings"); + PADDLE_ENFORCE_EQ( + paddings_dim.size(), 1, + "Size of Input(Paddings)'s dimension should be equal to 1."); + PADDLE_ENFORCE_EQ(paddings_dim[0], 4, + "Shape of Input(Paddings) should be equal to [4]."); out_dims[1] = x_dim[1]; - out_dims[2] = x_dim[2] + paddings[0] + paddings[1]; // height - out_dims[3] = x_dim[3] + paddings[2] + paddings[3]; // width - } else { // NHWC + out_dims[2] = x_dim[2]; out_dims[3] = x_dim[3]; - out_dims[1] = x_dim[1] + paddings[0] + paddings[1]; - out_dims[2] = x_dim[2] + paddings[2] + paddings[3]; + } else { + auto paddings = ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE_EQ(paddings.size(), 4, + "Size of paddings should be equal to 4."); + if (data_format == "NCHW") { + out_dims[1] = x_dim[1]; + out_dims[2] = x_dim[2] + paddings[0] + paddings[1]; // height + out_dims[3] = x_dim[3] + paddings[2] + paddings[3]; // width + } else { // NHWC + out_dims[3] = x_dim[3]; + out_dims[1] = x_dim[1] + paddings[0] + paddings[1]; + out_dims[2] = x_dim[2] + paddings[2] + paddings[3]; + } } ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); @@ -466,6 +507,13 @@ class Pad2dOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", /*->*/ "Out"); } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + } }; class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker { @@ -477,6 +525,12 @@ class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output of pad2d op. " "A tensor with the same shape as X."); + AddInput("Paddings", + "A 1-D tensor to describe the padding rules." + "paddings=[0, 1, 2, 3] means " + "padding 0 row to top, 1 row to bottom, 2 columns to left " + "and 3 columns to right. 
Size of paddings must be 4.") + .AsDispensable(); AddAttr>( "paddings", "(vector) " @@ -554,6 +608,13 @@ class Pad2dOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + } }; class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { @@ -564,6 +625,7 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { auto* bind = new framework::OpDesc(); bind->SetInput("X", Input("X")); + bind->SetInput("Paddings", Input("Paddings")); bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index 9ba0ddbd84..72eca08b06 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -287,20 +287,50 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data, } } +static inline void GetPaddings(int* paddings, + const framework::ExecutionContext& context) { + auto* paddings_t = context.Input("Paddings"); + if (paddings_t) { + Tensor pads; + framework::TensorCopySync(*paddings_t, platform::CPUPlace(), &pads); + auto pads_data = pads.data(); + paddings[0] = pads_data[0]; + paddings[1] = pads_data[1]; + paddings[2] = pads_data[2]; + paddings[3] = pads_data[3]; + } else { + auto pads = context.Attr>("paddings"); + std::copy(pads.begin(), pads.end(), paddings); + } +} + template class Pad2dCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); + int pads[4]; + GetPaddings(pads, context); auto mode = context.Attr("mode"); auto data_format = context.Attr("data_format"); T value = context.Attr("pad_value"); + auto* x = context.Input("X"); - auto* out = context.Output("Out"); auto in_dims = x->dims(); - auto out_dims = out->dims(); const T* in_data = x->data(); - T* out_data = out->mutable_data(context.GetPlace()); + auto* out = context.Output("Out"); + auto out_dims = out->dims(); + if (data_format == "NCHW") { + out_dims[0] = in_dims[0]; + out_dims[1] = in_dims[1]; + out_dims[2] = in_dims[2] + pads[0] + pads[1]; + out_dims[3] = in_dims[3] + pads[2] + pads[3]; + } else { + out_dims[0] = in_dims[0]; + out_dims[1] = in_dims[1] + pads[0] + pads[1]; + out_dims[2] = in_dims[2] + pads[2] + pads[3]; + out_dims[3] = in_dims[3]; + } + T* out_data = out->mutable_data(out_dims, context.GetPlace()); const int pad_top = pads[0]; const int pad_left = pads[2]; const int num = in_dims[0]; @@ -356,7 +386,8 @@ template class Pad2dGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); + int pads[4]; + GetPaddings(pads, context); auto mode = context.Attr("mode"); auto data_format = context.Attr("data_format"); auto* d_out = context.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index 5f1a48b6de..d68ba9d661 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -155,8 +155,8 @@ class RandomCropKernel : public framework::OpKernel { seed = 
*cpu_seed.data(); } } else { - VLOG(50) << "WARNING: The input 'Seed' is not initialized, use attribute " - "'startup_seed' instead."; + VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute " + "'startup_seed' instead."; seed = ctx.Attr("startup_seed"); } auto shape = ctx.Attr>("shape"); diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 6c919ee178..7c284312df 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -28,6 +28,12 @@ reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc) reader_library(create_py_reader_op SRCS create_py_reader_op.cc) +if (NOT WIN32 AND NOT ON_INFER) + cc_library(ctr_reader SRCS ctr_reader.cc DEPS gzstream reader zlib) + cc_test(ctr_reader_test SRCS ctr_reader_test.cc DEPS ctr_reader) + reader_library(create_ctr_reader_op SRCS create_ctr_reader_op.cc DEPS ctr_reader) +endif () + cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc) # Export local libraries to parent # set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 618248f872..51b980acb5 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -42,7 +42,7 @@ class BlockingQueue { std::unique_lock lock(mutex_); send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; }); if (closed_) { - VLOG(50) + VLOG(5) << "WARNING: Sending an element to a closed reader::BlokcingQueue."; return false; } @@ -56,7 +56,7 @@ class BlockingQueue { std::unique_lock lock(mutex_); send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; }); if (closed_) { - VLOG(50) + VLOG(5) << "WARNING: Sending an element to a closed reader::BlokcingQueue."; return false; } diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc new file mode 100644 index 0000000000..58a465d87a --- /dev/null +++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
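Aside from the new CTR reader, this part of the patch also lowers a number of VLOG levels (50 -> 5, 100 -> 10, and so on). With glog, a VLOG(n) message is emitted only when the runtime verbosity is at least n, so the previous double-digit levels were practically unreachable with typical --v / GLOG_v settings. A small illustration (the levels here are only examples):

    #include <glog/logging.h>

    void Demo() {
      VLOG(3) << "visible when run with GLOG_v=3 (or --v=3) or higher";
      VLOG(10) << "only visible when the verbosity is raised to 10 or more";
    }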
+ +#include "paddle/fluid/operators/reader/ctr_reader.h" + +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/reader/reader_op_registry.h" + +namespace paddle { +namespace operators { +namespace reader { + +class CreateCTRReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + if (out->Get() != nullptr) return; + + const std::string& queue_name = Input("blocking_queue"); + auto* queue_holder_var = scope.FindVar(queue_name); + PADDLE_ENFORCE_NOT_NULL( + queue_holder_var, + "No LoDTensorBlockingQueueHolder variable with name %s found", + queue_name); + auto* queue_holder = + queue_holder_var->template GetMutable(); + + int thread_num = Attr("thread_num"); + std::vector slots = Attr>("slots"); + int batch_size = Attr("batch_size"); + std::vector file_list = + Attr>("file_list"); + out->Reset(std::make_shared(queue_holder->GetQueue(), batch_size, + thread_num, slots, file_list)); + } +}; + +class CreateCTRReaderOpMaker : public FileReaderMakerBase { + protected: + void Apply() override { + AddInput("blocking_queue", + "Name of the `LoDTensorBlockingQueueHolder` variable"); + AddAttr("thread_num", "the thread num to read data"); + AddAttr("batch_size", "the batch size of read data"); + AddAttr>("file_list", + "The list of files that need to read"); + AddAttr>( + "slots", "the slots that should be extract from file"); + + AddComment(R"DOC( + Create CTRReader to support read ctr data with cpp. + )DOC"); + } +}; + +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace reader = ::paddle::operators::reader; + +REGISTER_FILE_READER_OPERATOR(create_ctr_reader, reader::CreateCTRReaderOp, + reader::CreateCTRReaderOpMaker); diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc index 3fe4e9e7ad..3f72890a7c 100644 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -26,7 +26,7 @@ class ShuffleReader : public framework::DecoratedReader { ShuffleReader(const std::shared_ptr& reader, size_t buffer_size, size_t seed = 0) : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) { - VLOG(100) << "Create shuffle reader of " << reader_; + VLOG(10) << "Create shuffle reader of " << reader_; if (seed_ == 0) { std::random_device device; seed_ = device(); @@ -37,7 +37,7 @@ class ShuffleReader : public framework::DecoratedReader { void ReadNextImpl(std::vector* out) override { out->clear(); if (iteration_pos_ >= buffer_.size()) { - VLOG(100) << "Resetting shuffle buffer"; + VLOG(10) << "Resetting shuffle buffer"; ReloadBuffer(); if (buffer_.empty()) { return; @@ -73,7 +73,7 @@ class ShuffleReader : public framework::DecoratedReader { std::mt19937 g(seed_); std::shuffle(buffer_.begin(), buffer_.end(), g); seed_ = g(); // update seed_; - VLOG(100) << "random buffer size = " << buffer_.size(); + VLOG(10) << "random buffer size = " << buffer_.size(); } size_t buffer_size_; diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc new file mode 100644 index 0000000000..d1d3ddc89d --- /dev/null +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -0,0 +1,238 @@ +// Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/ctr_reader.h" + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace paddle { +namespace operators { +namespace reader { + +static inline void string_split(const std::string& s, const char delimiter, + std::vector* output) { + size_t start = 0; + size_t end = s.find_first_of(delimiter); + + while (end <= std::string::npos) { + output->emplace_back(s.substr(start, end - start)); + if (end == std::string::npos) { + break; + } + start = end + 1; + end = s.find_first_of(delimiter, start); + } +} + +static inline void parse_line( + const std::string& line, + const std::unordered_map& slot_to_index, + int64_t* label, + std::unordered_map>* slot_to_data) { + std::vector ret; + string_split(line, ' ', &ret); + *label = std::stoi(ret[2]) > 0; + + for (size_t i = 3; i < ret.size(); ++i) { + const std::string& item = ret[i]; + std::vector feasign_and_slot; + string_split(item, ':', &feasign_and_slot); + if (feasign_and_slot.size() == 2 && + slot_to_index.find(feasign_and_slot[1]) != slot_to_index.end()) { + int64_t feasign = std::strtoll(feasign_and_slot[0].c_str(), NULL, 10); + (*slot_to_data)[feasign_and_slot[1]].push_back(feasign); + } + } + + // NOTE:: if the slot has no value, then fill [0] as it's data. 
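A worked pass of parse_line over one record (the sample line comes from the unit test added later in this patch, with slots {"6002", "6003"}):

    line:      "aaaa 1 0 0:6002 1:6003 2:6004 3:6005 4:6006 -1"
    label:     ret[2] == "0"  ->  std::stoi("0") > 0  ->  0
    "0:6002":  slot 6002 is requested  ->  slot_to_data["6002"] = {0}
    "1:6003":  slot 6003 is requested  ->  slot_to_data["6003"] = {1}
    "2:6004" ... "4:6006": slots not in slot_to_index, dropped
    "-1":      splits into a single token, so it is ignored
    (any requested slot that ends up empty is afterwards filled with {0})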
+ for (auto& item : slot_to_index) { + if (slot_to_data->find(item.first) == slot_to_data->end()) { + (*slot_to_data)[item.first].push_back(0); + } + } +} + +class Reader { + public: + virtual ~Reader() {} + virtual bool HasNext() = 0; + virtual void NextLine(std::string* line) = 0; +}; + +class GzipReader : public Reader { + public: + explicit GzipReader(const std::string& file_name) + : gzstream_(file_name.c_str()) {} + + ~GzipReader() {} + + bool HasNext() override { return gzstream_.peek() != EOF; } + + void NextLine(std::string* line) override { std::getline(gzstream_, *line); } + + private: + igzstream gzstream_; +}; + +class MultiGzipReader : public Reader { + public: + explicit MultiGzipReader(const std::vector& file_list) { + for (auto& file : file_list) { + readers_.emplace_back(std::make_shared(file)); + } + } + + bool HasNext() override { + if (current_reader_index_ >= readers_.size()) { + return false; + } + if (!readers_[current_reader_index_]->HasNext()) { + current_reader_index_++; + return HasNext(); + } + return true; + } + + void NextLine(std::string* line) override { + readers_[current_reader_index_]->NextLine(line); + } + + private: + std::vector> readers_; + size_t current_reader_index_ = 0; +}; + +void MonitorThread(std::vector* thread_status, + std::shared_ptr queue) { + VLOG(30) << "monitor thread in"; + bool reader_thread_is_running = true; + while (reader_thread_is_running) { + VLOG(30) << "reader_thread_is_running"; + reader_thread_is_running = false; + for (size_t i = 0; i < (*thread_status).size(); ++i) { + if ((*thread_status)[i] == Running) { + VLOG(30) << "reader is running!"; + reader_thread_is_running = true; + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + VLOG(30) << "all reader thread is stopped, push empty data into queue"; + queue->Push({}); + VLOG(30) << "monitor thread exited"; +} + +void ReadThread(const std::vector& file_list, + const std::vector& slots, int batch_size, + int thread_id, std::vector* thread_status, + std::shared_ptr queue) { + VLOG(30) << "[" << thread_id << "]" + << " reader thread start! 
thread_id = " << thread_id; + for (auto& file : file_list) { + VLOG(30) << "[" << thread_id << "]" + << " file " << file; + } + (*thread_status)[thread_id] = Running; + VLOG(30) << "set status to running"; + + std::unordered_map slot_to_index; + for (size_t i = 0; i < slots.size(); ++i) { + slot_to_index[slots[i]] = i; + } + + std::string line; + + std::vector>> batch_data; + std::vector batch_label; + + MultiGzipReader reader(file_list); + + VLOG(30) << "reader inited"; + + while (reader.HasNext()) { + batch_data.clear(); + batch_data.reserve(batch_size); + + batch_label.clear(); + batch_label.reserve(batch_size); + + // read batch_size data + for (int i = 0; i < batch_size; ++i) { + if (reader.HasNext()) { + reader.NextLine(&line); + std::unordered_map> slot_to_data; + int64_t label; + parse_line(line, slot_to_index, &label, &slot_to_data); + batch_data.push_back(slot_to_data); + batch_label.push_back(label); + } else { + break; + } + } + + std::vector lod_datas; + + // first insert tensor for each slots + for (auto& slot : slots) { + std::vector lod_data{0}; + std::vector batch_feasign; + + for (size_t i = 0; i < batch_data.size(); ++i) { + auto& feasign = batch_data[i][slot]; + lod_data.push_back(lod_data.back() + feasign.size()); + batch_feasign.insert(batch_feasign.end(), feasign.begin(), + feasign.end()); + } + + framework::LoDTensor lod_tensor; + framework::LoD lod{lod_data}; + lod_tensor.set_lod(lod); + int64_t* tensor_data = lod_tensor.mutable_data( + framework::make_ddim({1, static_cast(batch_feasign.size())}), + platform::CPUPlace()); + memcpy(tensor_data, batch_feasign.data(), + batch_feasign.size() * sizeof(int64_t)); + lod_datas.push_back(lod_tensor); + } + + // insert label tensor + framework::LoDTensor label_tensor; + auto* label_tensor_data = label_tensor.mutable_data( + framework::make_ddim({1, static_cast(batch_label.size())}), + platform::CPUPlace()); + memcpy(label_tensor_data, batch_label.data(), + batch_label.size() * sizeof(int64_t)); + lod_datas.push_back(label_tensor); + + queue->Push(lod_datas); + VLOG(40) << "push one data, queue_size=" << queue->Size(); + } + + (*thread_status)[thread_id] = Stopped; + VLOG(30) << "set status to stopped, thread " << thread_id << " exited"; +} + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h new file mode 100644 index 0000000000..9b2a11bae1 --- /dev/null +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -0,0 +1,133 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include // NOLINT +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" + +namespace paddle { +namespace operators { +namespace reader { + +enum ReaderThreadStatus { Running, Stopped }; + +void ReadThread(const std::vector& file_list, + const std::vector& slots, int batch_size, + int thread_id, std::vector* thread_status, + std::shared_ptr queue); + +// monitor all running thread, if they are all stopped, +// then push an empty data into LoDTensorBlockingQueue +void MonitorThread(std::vector* thread_status, + std::shared_ptr queue); + +class CTRReader : public framework::FileReader { + public: + explicit CTRReader(const std::shared_ptr& queue, + int batch_size, int thread_num, + const std::vector& slots, + const std::vector& file_list) + : batch_size_(batch_size), slots_(slots), file_list_(file_list) { + PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); + PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); + PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); + thread_num_ = + file_list_.size() > thread_num ? thread_num : file_list_.size(); + queue_ = queue; + SplitFiles(); + for (size_t i = 0; i < thread_num_; ++i) { + read_thread_status_.push_back(Stopped); + } + } + + ~CTRReader() {} + + void ReadNext(std::vector* out) override { + bool success; + *out = queue_->Pop(&success); + if (!success) out->clear(); + } + + void Shutdown() override { + VLOG(3) << "Shutdown reader"; + if (status_ == ReaderStatus::kStopped) { + return; + } + // shutdown should stop all the reader thread + for (auto& read_thread : read_threads_) { + read_thread->join(); + } + monitor_thread_->join(); + + read_threads_.clear(); + monitor_thread_.reset(nullptr); + queue_->Close(); + status_ = ReaderStatus::kStopped; + } + + void Start() override { + VLOG(3) << "Start reader"; + PADDLE_ENFORCE_EQ(read_threads_.size(), 0, "read thread should be empty!"); + queue_->ReOpen(); + VLOG(3) << "reopen success"; + VLOG(3) << "thread_num " << thread_num_; + for (int thread_id = 0; thread_id < thread_num_; thread_id++) { + read_threads_.emplace_back(new std::thread( + std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, + thread_id, &read_thread_status_, queue_))); + } + monitor_thread_.reset(new std::thread( + std::bind(&MonitorThread, &read_thread_status_, queue_))); + status_ = ReaderStatus::kRunning; + } + + private: + void SplitFiles() { + file_groups_.resize(thread_num_); + for (size_t i = 0; i < file_list_.size(); ++i) { + auto& file_name = file_list_[i]; + std::ifstream f(file_name.c_str()); + PADDLE_ENFORCE(f.good(), "file %s not exist!", file_name); + file_groups_[i % thread_num_].push_back(file_name); + } + } + + private: + size_t thread_num_; + const int batch_size_; + const std::vector slots_; + const std::vector file_list_; + std::shared_ptr queue_; + std::vector> read_threads_; + std::unique_ptr monitor_thread_; + std::vector read_thread_status_; + std::vector> file_groups_; +}; + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc new file mode 100644 index 0000000000..8dba9baebc --- /dev/null +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -0,0 +1,155 @@ +// Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/ctr_reader.h" + +#include +#include + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" + +using paddle::operators::reader::LoDTensorBlockingQueue; +using paddle::operators::reader::LoDTensorBlockingQueueHolder; +using paddle::operators::reader::CTRReader; +using paddle::framework::LoDTensor; +using paddle::framework::LoD; +using paddle::framework::DDim; +using paddle::platform::CPUPlace; +using paddle::framework::make_ddim; + +static void generatedata(const std::vector& data, + const std::string& file_name) { + std::ifstream in(file_name.c_str()); + if (in.good()) { + VLOG(3) << "file " << file_name << " exist, delete it first!"; + remove(file_name.c_str()); + } else { + in.close(); + } + + ogzstream out(file_name.c_str()); + PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name); + for (auto& c : data) { + out << c; + } + out.close(); + PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name); +} + +static inline void check_all_data( + const std::vector& ctr_data, + const std::vector& slots, const std::vector& label_dims, + const std::vector& label_value, + const std::vector>>& data_slot_6002, + const std::vector>>& data_slot_6003, + size_t batch_num, size_t batch_size, + std::shared_ptr queue, CTRReader* reader) { + std::vector out; + for (size_t i = 0; i < batch_num; ++i) { + reader->ReadNext(&out); + ASSERT_EQ(out.size(), slots.size() + 1); + auto& label_tensor = out.back(); + ASSERT_EQ(label_tensor.dims(), label_dims[i]); + for (size_t j = 0; j < batch_size && i * batch_num + j < ctr_data.size(); + ++j) { + auto& label = label_tensor.data()[j]; + ASSERT_TRUE(label == 0 || label == 1); + ASSERT_EQ(label, label_value[i * batch_size + j]); + } + auto& tensor_6002 = out[0]; + ASSERT_EQ(std::get<0>(data_slot_6002[i]), tensor_6002.lod()); + ASSERT_EQ(std::memcmp(std::get<1>(data_slot_6002[i]).data(), + tensor_6002.data(), + tensor_6002.dims()[1] * sizeof(int64_t)), + 0); + } + reader->ReadNext(&out); + ASSERT_EQ(out.size(), 0); + ASSERT_EQ(queue->Size(), 0); +} + +TEST(CTR_READER, read_data) { + const std::vector ctr_data = { + "aaaa 1 0 0:6002 1:6003 2:6004 3:6005 4:6006 -1\n", + "bbbb 1 0 5:6003 6:6003 7:6003 8:6004 9:6004 -1\n", + "cccc 1 1 10:6002 11:6002 12:6002 13:6002 14:6002 -2\n", + "dddd 1 0 15:6003 16:6003 17:6003 18:6003 19:6004 -3\n", + "1111 1 1 20:6001 21:6001 22:6001 23:6001 24:6001 12\n", + "2222 1 1 25:6004 26:6004 27:6004 28:6005 29:6005 aa\n", + "3333 1 0 30:6002 31:6003 32:6004 33:6004 34:6005 er\n", + "eeee 1 1 35:6003 36:6003 37:6005 38:6005 39:6005 dd\n", + "ffff 1 1 40:6002 41:6003 42:6004 43:6004 44:6005 66\n", + "gggg 1 1 46:6006 45:6006 47:6003 48:6003 49:6003 ba\n", + }; + std::string gz_file_name = "test_ctr_reader_data.gz"; + generatedata(ctr_data, gz_file_name); + + 
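The expected tuples defined in the following lines can be derived by hand from the first batch (batch_size = 3, i.e. the lines "aaaa", "bbbb" and "cccc"):

    labels:     0, 0, 1                                     -> label tensor {0, 0, 1}
    slot 6002:  "aaaa" -> {0}; "bbbb" -> none, filled {0}; "cccc" -> {10, 11, 12, 13, 14}
                lod = {0, 1, 2, 7}, data = {0, 0, 10, 11, 12, 13, 14}        (tuple a1)
    slot 6003:  "aaaa" -> {1}; "bbbb" -> {5, 6, 7}; "cccc" -> none, filled {0}
                lod = {0, 1, 4, 5}, data = {1, 5, 6, 7, 0}                   (tuple b1)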
std::vector label_value = {0, 0, 1, 0, 1, 1, 0, 1, 1, 1}; + + std::tuple> a1({{0, 1, 2, 7}}, + {0, 0, 10, 11, 12, 13, 14}); + std::tuple> a2({{0, 1, 2, 3}}, {0, 0, 0}); + std::tuple> a3({{0, 1, 2, 3}}, {30, 0, 40}); + std::tuple> a4({{0, 1}}, {0}); + std::vector>> data_slot_6002{a1, a2, a3, + a4}; + + std::tuple> b1({{0, 1, 4, 5}}, {1, 5, 6, 7, 0}); + std::tuple> b2({{0, 4, 5, 6}}, + {15, 16, 17, 18, 0, 0}); + std::tuple> b3({{0, 1, 3, 4}}, {31, 35, 36, 41}); + std::tuple> b4({{0, 3}}, {47, 48, 49}); + std::vector>> data_slot_6003{b1, b2, b3, + b4}; + + std::vector label_dims = {{1, 3}, {1, 3}, {1, 3}, {1, 1}}; + + LoDTensorBlockingQueueHolder queue_holder; + int capacity = 64; + queue_holder.InitOnce(capacity, {}, false); + + std::shared_ptr queue = queue_holder.GetQueue(); + + int batch_size = 3; + int thread_num = 1; + std::vector slots = {"6002", "6003"}; + std::vector file_list; + for (int i = 0; i < thread_num; ++i) { + file_list.push_back(gz_file_name); + } + + CTRReader reader(queue, batch_size, thread_num, slots, file_list); + + reader.Start(); + size_t batch_num = + std::ceil(static_cast(ctr_data.size()) / batch_size) * thread_num; + check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002, + data_slot_6003, batch_num, batch_size, queue, &reader); + + reader.Shutdown(); + + reader.Start(); + check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002, + data_slot_6003, batch_num, batch_size, queue, &reader); + reader.Shutdown(); +} diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 283dce9321..162bfcbb08 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -160,7 +160,7 @@ class RecurrentBase : public framework::OperatorBase { Callback callback) { PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size()); for (size_t i = 0; i < dst_vars.size(); ++i) { - VLOG(100) << "Link " << src_vars[i] << " to " << dst_vars[i]; + VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i]; AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback); } } @@ -176,7 +176,7 @@ class RecurrentBase : public framework::OperatorBase { Callback callback) { PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size()); for (size_t i = 0; i < dst_vars.size(); ++i) { - VLOG(100) << "Link " << src_vars[i] << " to " << dst_vars[i]; + VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i]; AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback); } } @@ -230,7 +230,7 @@ class RecurrentOp : public RecurrentBase { void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { auto seq_len = static_cast(this->GetSequenceLength(scope)); - VLOG(30) << "Static RNN input sequence length = " << seq_len; + VLOG(3) << "Static RNN input sequence length = " << seq_len; StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); @@ -241,7 +241,7 @@ class RecurrentOp : public RecurrentBase { for (size_t i = 0; i < seq_len; ++i) { size_t seq_offset = reverse ? seq_len - i - 1 : i; - VLOG(30) << "Recurrent operate at the time step " << seq_offset; + VLOG(3) << "Recurrent operate at the time step " << seq_offset; auto &cur_scope = scopes.CurScope(); @@ -334,7 +334,7 @@ class RecurrentGradOp : public RecurrentBase { for (size_t step_id = 0; step_id < seq_len; ++step_id) { size_t seq_offset = reverse ? 
step_id : seq_len - step_id - 1; - VLOG(30) << "Recurrent backward operate at the time step " << seq_offset; + VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; auto &cur_scope = scopes.CurScope(); // Link outside::output_grads --> inside::output_grads // inside::output_grad = outside::output_grad[seq_offset:seq_offset+1] @@ -348,11 +348,11 @@ class RecurrentGradOp : public RecurrentBase { }); auto og_set = List2Set(Inputs(kOutputGrads)); - if (VLOG_IS_ON(100)) { + if (VLOG_IS_ON(10)) { std::ostringstream sout; std::copy(og_set.begin(), og_set.end(), std::ostream_iterator(sout, ",")); - VLOG(100) << " RNN output gradients = [" << sout.str() << "]"; + VLOG(10) << " RNN output gradients = [" << sout.str() << "]"; } // Link states @@ -374,7 +374,7 @@ class RecurrentGradOp : public RecurrentBase { auto &ex_tensor = ex_scope.FindVar(ex_grad)->Get(); - VLOG(100) << " RNN link " << cur_grad << " from " << ex_grad; + VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad; auto *cur_grad_var = cur_scope.Var(cur_grad); auto cur_grad_tensor = cur_grad_var->GetMutable(); @@ -382,12 +382,12 @@ class RecurrentGradOp : public RecurrentBase { } } - VLOG(50) << "Recurrent memory linking finished "; + VLOG(5) << "Recurrent memory linking finished "; // Run step block with cur_scope executor.Run(*program, &cur_scope, block->ID(), false /*create_local_scope*/); - VLOG(50) << "executor.Run finished "; + VLOG(5) << "executor.Run finished "; auto local_var_names = LocalVarNames(cur_scope); @@ -436,7 +436,7 @@ class RecurrentGradOp : public RecurrentBase { cur_scope.Rename(new_inside_name, inside_grad_name); } } - VLOG(50) << "Accumulate Parameter finished "; + VLOG(5) << "Accumulate Parameter finished "; // Copy input gradient from inside to outside // outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad @@ -455,7 +455,7 @@ class RecurrentGradOp : public RecurrentBase { auto dst = outside->Slice(seq_offset, seq_offset + 1); framework::TensorCopy(inside, place, dev_ctx, &dst); }); - VLOG(50) << "Link outside gradient finished "; + VLOG(5) << "Link outside gradient finished "; if (step_id + 1 == seq_len) { // at_end // copy initialize states gradient from inside to outside @@ -468,7 +468,7 @@ class RecurrentGradOp : public RecurrentBase { outside->mutable_data(place, inside.type()); framework::TensorCopy(inside, place, dev_ctx, outside); }); - VLOG(50) << "Link initialize state gradient finished "; + VLOG(5) << "Link initialize state gradient finished "; } scopes.Next(); } diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc index e4f4fe358e..7ceb5b5846 100644 --- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc @@ -201,6 +201,9 @@ class IdentityInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *context) const override { context->SetOutputDim("Out", context->GetInputDim("X")); + if (!context->IsRuntime()) { + context->ShareLoD("X", /*->*/ "Out"); + } } }; diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc index b840e69096..0fb7776fd9 100644 --- a/paddle/fluid/operators/rnn_memory_helper_op.cc +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -93,7 +93,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { in_grad_var_name); if (out_grad_var == nullptr) { - VLOG(50) << "Using fill constant 0 as starting 
gradient"; + VLOG(5) << "Using fill constant 0 as starting gradient"; auto in_var_name = Input("X"); auto *in_var = scope.FindVar(in_var_name); auto &in_var_tensor = in_var->Get(); diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 0dcf3f0e37..e79cffcf49 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -110,7 +110,7 @@ class SaveOp : public framework::OperatorBase { lt_var != nullptr, "Can not find variable kLookupTablePath for SaveSelectedRows"); std::string filename = lt_var->data(); - VLOG(40) << "SaveSelectedRows get File name: " << filename; + VLOG(4) << "SaveSelectedRows get File name: " << filename; MkDirRecursively(DirName(filename).c_str()); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h index 7ff68f9c71..18acb735ce 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h @@ -127,7 +127,7 @@ class SequenceMaskKernel : public framework::OpKernel { auto x_numel = x->numel(); if (maxlen < 0) { #ifdef __NVCC__ - VLOG(100) + VLOG(10) << "SequenceMaskOp on GPU may be slow when maxlen is not provided."; maxlen = static_cast( thrust::reduce(thrust::device_pointer_cast(x_data), diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index e1c74c3a2f..2e2aea2c63 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -100,6 +100,9 @@ class ShrinkRNNMemoryInferShape : public framework::InferShapeBase { PADDLE_ENFORCE(context->HasInput("I")); PADDLE_ENFORCE(context->HasInput("RankTable")); context->SetOutputDim("Out", context->GetInputDim("X")); + if (!context->IsRuntime()) { + context->DecreaseLoDLevel("X", /*->*/ "Out"); + } } }; diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 193de05422..14746fa951 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -18,6 +18,7 @@ namespace paddle { namespace operators { using framework::Tensor; +const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: @@ -100,6 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker AddOutput("Out", "(Tensor, default Tensor), a 2-D tensor with shape N x D " " of elementwise logistic losses."); + AddAttr("ignore_index", + "(int, default kIgnoreIndex), Specifies a target value that " + "is ignored and" + "does not contribute to the input gradient.") + .SetDefault(kIgnoreIndex); AddComment(R"DOC( SigmoidCrossEntropyWithLogits Operator. diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h index faef72866e..b8731c2327 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -15,33 +15,72 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/legacy/utils/Logging.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +struct SigmoidCrossEntropyWithLogitsForward { + HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index) + : ignore_index(ignore_index) {} + + HOSTDEVICE T operator()(const T &x, const T &label) const { + if (static_cast(label) == ignore_index) { + return static_cast(0.); + } + T term1 = (x > 0) ? x : 0; + T term2 = x * label; + T term3 = std::log(static_cast(1) + std::exp(-(std::abs(x)))); + return term1 - term2 + term3; + } + + int ignore_index; +}; + +template +struct SigmoidCrossEntropyWithLogitsBackward { + HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index) + : ignore_index(ignore_index) {} + + HOSTDEVICE T operator()(const T &x, const T &label) const { + if (static_cast(label) == ignore_index) { + return static_cast(0.); + } + T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); + return simoid_x - label; + } + + int ignore_index; +}; + // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) template class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = context.Input("Label"); - framework::Tensor *Out = context.Output("Out"); + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + Tensor *Out = context.Output("Out"); Out->mutable_data(context.GetPlace()); + int ignore_index = context.Attr("ignore_index"); - auto x = framework::EigenVector::Flatten(*X); - auto labels = framework::EigenVector::Flatten(*Labels); - auto out = framework::EigenVector::Flatten(*Out); + auto x = EigenVector::Flatten(*X); + auto labels = EigenVector::Flatten(*Labels); + auto out = EigenVector::Flatten(*Out); auto &place = *context.device_context().eigen_device(); - // term1 = max(x, 0) - auto term1 = x.cwiseMax(static_cast(0)); - // term2 = x * labels - auto term2 = x * labels; - // term3 = log(1 + exp(-abs(x))) - auto term3 = (static_cast(1) + (-(x.abs())).exp()).log(); - - out.device(place) = term1 - term2 + term3; + out.device(place) = x.binaryExpr( + labels, SigmoidCrossEntropyWithLogitsForward(ignore_index)); } }; @@ -50,23 +89,23 @@ template class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = context.Input("Label"); - const framework::Tensor *dOut = - context.Input(framework::GradVarName("Out")); - framework::Tensor *dX = - context.Output(framework::GradVarName("X")); + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + Tensor *dX = context.Output(framework::GradVarName("X")); dX->mutable_data(context.GetPlace()); - auto x = framework::EigenVector::Flatten(*X); - auto labels = framework::EigenVector::Flatten(*Labels); - auto dout = framework::EigenVector::Flatten(*dOut); - auto dx = framework::EigenVector::Flatten(*dX); + auto ignore_index = 
context.Attr("ignore_index"); + auto x = EigenVector::Flatten(*X); + auto labels = EigenVector::Flatten(*Labels); + auto dout = EigenVector::Flatten(*dOut); + auto dx = EigenVector::Flatten(*dX); auto &place = *context.template device_context().eigen_device(); - auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); - dx.device(place) = dout * (sigmoid_x - labels); + auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward( + static_cast(ignore_index))); + dx.device(place) = dout * diff; } }; diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc index 01819f53e3..d2b1495354 100644 --- a/paddle/fluid/operators/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/softmax_mkldnn_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "mkldnn.hpp" #include "paddle/fluid/operators/softmax_op.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 8eb5c7691e..91829d5761 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -36,9 +36,7 @@ class SoftmaxKernel : public framework::OpKernel { Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); #ifdef PADDLE_ON_INFERENCE - math::SoftmaxFunctor< - DeviceContext, T, - std::is_same::value>()( + math::SoftmaxFunctor()( context.template device_context(), &X_2d, &Out_2d); #else math::SoftmaxFunctor()( diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc index 2ae5c17bf6..f9a16ef35e 100644 --- a/paddle/fluid/operators/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -186,7 +186,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } if (in_dim.empty()) { - VLOG(30) << "WARNING: all the inputs are empty"; + VLOG(3) << "WARNING: all the inputs are empty"; in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); } else { in_dim[0] = static_cast(first_dim); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index c67b694283..7df14158f3 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -45,7 +45,7 @@ class SumOp : public framework::OperatorWithKernel { size_t N = x_dims.size(); PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0."); if (N == 1) { - VLOG(30) << "Warning: sum have only one input, may waste memory"; + VLOG(3) << "Warning: sum have only one input, may waste memory"; } framework::DDim in_dim({0}); @@ -157,8 +157,8 @@ class SumOpVarTypeInference : public framework::VarTypeInference { auto& inputs = op_desc.Input("X"); auto var_type = framework::proto::VarType::SELECTED_ROWS; for (auto& name : op_desc.Input("X")) { - VLOG(100) << name << " " - << block->FindRecursiveOrCreateVar(name).GetType(); + VLOG(10) << name << " " + << block->FindRecursiveOrCreateVar(name).GetType(); } bool any_input_is_lod_tensor = std::any_of( diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 3af9376da1..6eef4c98c4 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -127,9 +127,9 @@ class TensorRTEngineKernel : public framework::OpKernel { // Convert output tensor from engine to fluid int output_index = 0; - VLOG(40) << "TensorRT Engine Op Outputs:"; + VLOG(4) 
<< "TensorRT Engine Op Outputs:"; for (const auto& y : context.Outputs("Ys")) { - VLOG(40) << y; + VLOG(4) << y; // convert output and copy to fluid. nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); auto dims = trt_t->getDimensions(); @@ -167,7 +167,7 @@ class TensorRTEngineKernel : public framework::OpKernel { protected: void Prepare(const framework::ExecutionContext& context) const { - VLOG(40) << "Prepare engine"; + VLOG(4) << "Prepare engine"; // Get the ProgramDesc and pass to convert. framework::proto::BlockDesc block_desc; block_desc.ParseFromString(context.Attr("subgraph")); @@ -192,12 +192,12 @@ class TensorRTEngineKernel : public framework::OpKernel { engine->InitNetwork(); framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); - VLOG(40) << "parsed var size " << block.AllVars().size(); + VLOG(4) << "parsed var size " << block.AllVars().size(); // Add inputs - VLOG(40) << "declare inputs"; + VLOG(4) << "declare inputs"; for (auto& input : context.Inputs("Xs")) { if (parameters.count(input)) continue; - VLOG(40) << "declare input " << input; + VLOG(4) << "declare input " << input; auto* var = block.FindVar(input); // TensorRT engine need to create parameters. The parameter's description // should be set in diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc new file mode 100644 index 0000000000..e7597f7324 --- /dev/null +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -0,0 +1,221 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/fluid/operators/yolov3_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class Yolov3LossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("GTBox"), + "Input(GTBox) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("GTLabel"), + "Input(GTLabel) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), + "Output(Loss) of Yolov3LossOp should not be null."); + + auto dim_x = ctx->GetInputDim("X"); + auto dim_gtbox = ctx->GetInputDim("GTBox"); + auto dim_gtlabel = ctx->GetInputDim("GTLabel"); + auto anchors = ctx->Attrs().Get>("anchors"); + auto class_num = ctx->Attrs().Get("class_num"); + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); + PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3], + "Input(X) dim[3] and dim[4] should be euqal."); + PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_number * (5 " + "+ class_num))."); + PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3, + "Input(GTBox) should be a 3-D tensor"); + PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5"); + PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2, + "Input(GTBox) should be a 2-D tensor"); + PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0], + "Input(GTBox) and Input(GTLabel) dim[0] should be same"); + PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1], + "Input(GTBox) and Input(GTLabel) dim[1] should be same"); + PADDLE_ENFORCE_GT(anchors.size(), 0, + "Attr(anchors) length should be greater then 0."); + PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, + "Attr(anchors) length should be even integer."); + PADDLE_ENFORCE_GT(class_num, 0, + "Attr(class_num) should be an integer greater then 0."); + + std::vector dim_out({1}); + ctx->SetOutputDim("Loss", framework::make_ddim(dim_out)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of YOLO v3 loss operator, " + "This is a 4-D tensor with shape of [N, C, H, W]." 
+ "H and W should be same, and the second dimention(C) stores" + "box locations, confidence score and classification one-hot" + "key of each anchor box"); + AddInput("GTBox", + "The input tensor of ground truth boxes, " + "This is a 3-D tensor with shape of [N, max_box_num, 5], " + "max_box_num is the max number of boxes in each image, " + "In the third dimention, stores x, y, w, h coordinates, " + "x, y is the center cordinate of boxes and w, h is the " + "width and height and x, y, w, h should be divided by " + "input image height to scale to [0, 1]."); + AddInput("GTLabel", + "The input tensor of ground truth label, " + "This is a 2-D tensor with shape of [N, max_box_num], " + "and each element shoudl be an integer to indicate the " + "box class id."); + AddOutput("Loss", + "The output yolov3 loss tensor, " + "This is a 1-D tensor with shape of [1]"); + + AddAttr("class_num", "The number of classes to predict."); + AddAttr>("anchors", + "The anchor width and height, " + "it will be parsed pair by pair."); + AddAttr("ignore_thresh", + "The ignore threshold to ignore confidence loss."); + AddAttr("loss_weight_xy", "The weight of x, y location loss.") + .SetDefault(1.0); + AddAttr("loss_weight_wh", "The weight of w, h location loss.") + .SetDefault(1.0); + AddAttr( + "loss_weight_conf_target", + "The weight of confidence score loss in locations with target object.") + .SetDefault(1.0); + AddAttr("loss_weight_conf_notarget", + "The weight of confidence score loss in locations without " + "target object.") + .SetDefault(1.0); + AddAttr("loss_weight_class", "The weight of classification loss.") + .SetDefault(1.0); + AddComment(R"DOC( + This operator generate yolov3 loss by given predict result and ground + truth boxes. + + The output of previous network is in shape [N, C, H, W], while H and W + should be the same, specify the grid size, each grid point predict given + number boxes, this given number is specified by anchors, it should be + half anchors length, which following will be represented as S. In the + second dimention(the channel dimention), C should be S * (class_num + 5), + class_num is the box categoriy number of source dataset(such as coco), + so in the second dimention, stores 4 box location coordinates x, y, w, h + and confidence score of the box and class one-hot key of each anchor box. + + While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions + correspnd to: + + $$ + b_x = \sigma(t_x) + c_x + b_y = \sigma(t_y) + c_y + b_w = p_w e^{t_w} + b_h = p_h e^{t_h} + $$ + + While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$ + is specified by anchors. + + As for confidence score, it is the logistic regression value of IoU between + anchor boxes and ground truth boxes, the score of the anchor box which has + the max IoU should be 1, and if the anchor box has IoU bigger then ignore + thresh, the confidence score loss of this anchor box will be ignored. + + Therefore, the yolov3 loss consist of three major parts, box location loss, + confidence score loss, and classification loss. The MSE loss is used for + box location, and binary cross entropy loss is used for confidence score + loss and classification loss. + + Final loss will be represented as follow. 
+ + $$ + loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh} + + \loss_weight_{conf_target} * loss_{conf_target} + + \loss_weight_{conf_notarget} * loss_{conf_notarget} + + \loss_weight_{class} * loss_{class} + $$ + )DOC"); + } +}; + +class Yolov3LossOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null"); + auto dim_x = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("yolov3_loss_grad"); + op->SetInput("X", Input("X")); + op->SetInput("GTBox", Input("GTBox")); + op->SetInput("GTLabel", Input("GTLabel")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("GTBox"), {}); + op->SetOutput(framework::GradVarName("GTLabel"), {}); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker, + ops::Yolov3LossGradMaker); +REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad); +REGISTER_OP_CPU_KERNEL(yolov3_loss, ops::Yolov3LossKernel, + ops::Yolov3LossKernel); +REGISTER_OP_CPU_KERNEL(yolov3_loss_grad, ops::Yolov3LossGradKernel, + ops::Yolov3LossGradKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h new file mode 100644 index 0000000000..0bb285722d --- /dev/null +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -0,0 +1,483 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
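A minimal sketch of the box decoding described in the operator comment above (names are illustrative and not part of the patch); (cx, cy) is the grid cell holding the prediction and (pw, ph) is the matching anchor:

    #include <cmath>

    struct Box { float x, y, w, h; };  // center x/y, width, height, in grid units

    // b_x = sigmoid(t_x) + c_x,  b_y = sigmoid(t_y) + c_y,
    // b_w = p_w * exp(t_w),      b_h = p_h * exp(t_h)
    Box DecodeYoloBox(float tx, float ty, float tw, float th,
                      float cx, float cy, float pw, float ph) {
      auto sigmoid = [](float v) { return 1.0f / (1.0f + std::exp(-v)); };
      return {sigmoid(tx) + cx, sigmoid(ty) + cy,
              pw * std::exp(tw), ph * std::exp(th)};
    }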
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenTensor = framework::EigenTensor; +template +using EigenVector = framework::EigenVector; + +using Array5 = Eigen::DSizes; + +template +static inline bool isZero(T x) { + return fabs(x) < 1e-6; +} + +template +static inline T sigmoid(T x) { + return 1.0 / (exp(-1.0 * x) + 1.0); +} + +template +static inline T CalcMaskPointNum(const Tensor& mask) { + auto mask_t = EigenVector::Flatten(mask); + T count = 0.0; + for (int i = 0; i < mask_t.dimensions()[0]; i++) { + if (mask_t(i)) { + count += 1.0; + } + } + return count; +} + +template +static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, + const Tensor& mask) { + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + T error_sum = 0.0; + T points = 0.0; + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + error_sum += pow(x_t(i) - y_t(i), 2); + points += 1; + } + } + return (error_sum / points); +} + +template +static void CalcMSEGradWithMask(Tensor* grad, const Tensor& x, const Tensor& y, + const Tensor& mask, T mf) { + auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + grad_t(i) = 2.0 * (x_t(i) - y_t(i)) / mf; + } + } +} + +template +static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, + const Tensor& mask) { + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + T error_sum = 0.0; + T points = 0.0; + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + error_sum += + -1.0 * (y_t(i) * log(x_t(i)) + (1.0 - y_t(i)) * log(1.0 - x_t(i))); + points += 1; + } + } + return (error_sum / points); +} + +template +static inline void CalcBCEGradWithMask(Tensor* grad, const Tensor& x, + const Tensor& y, const Tensor& mask, + T mf) { + auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + grad_t(i) = ((1.0 - y_t(i)) / (1.0 - x_t(i)) - y_t(i) / x_t(i)) / mf; + } + } +} + +template +static void CalcPredResult(const Tensor& input, Tensor* pred_conf, + Tensor* pred_class, Tensor* pred_x, Tensor* pred_y, + Tensor* pred_w, Tensor* pred_h, const int anchor_num, + const int class_num) { + const int n = input.dims()[0]; + const int h = input.dims()[2]; + const int w = input.dims()[3]; + const int box_attr_num = 5 + class_num; + + auto input_t = EigenTensor::From(input); + auto pred_conf_t = EigenTensor::From(*pred_conf); + auto pred_class_t = EigenTensor::From(*pred_class); + auto pred_x_t = EigenTensor::From(*pred_x); + auto pred_y_t = EigenTensor::From(*pred_y); + auto pred_w_t = EigenTensor::From(*pred_w); + auto pred_h_t = EigenTensor::From(*pred_h); + + for (int i = 0; i < n; i++) { + for (int an_idx = 0; an_idx < anchor_num; an_idx++) { + for (int j = 0; j < h; j++) { + for (int k = 0; k < w; k++) { + pred_x_t(i, an_idx, j, k) = + sigmoid(input_t(i, box_attr_num * an_idx, j, k)); + pred_y_t(i, an_idx, j, k) = + sigmoid(input_t(i, box_attr_num * an_idx + 1, j, k)); + pred_w_t(i, an_idx, 
j, k) = + input_t(i, box_attr_num * an_idx + 2, j, k); + pred_h_t(i, an_idx, j, k) = + input_t(i, box_attr_num * an_idx + 3, j, k); + + pred_conf_t(i, an_idx, j, k) = + sigmoid(input_t(i, box_attr_num * an_idx + 4, j, k)); + + for (int c = 0; c < class_num; c++) { + pred_class_t(i, an_idx, j, k, c) = + sigmoid(input_t(i, box_attr_num * an_idx + 5 + c, j, k)); + } + } + } + } + } +} + +template +static T CalcBoxIoU(std::vector box1, std::vector box2) { + T b1_x1 = box1[0] - box1[2] / 2; + T b1_x2 = box1[0] + box1[2] / 2; + T b1_y1 = box1[1] - box1[3] / 2; + T b1_y2 = box1[1] + box1[3] / 2; + T b2_x1 = box2[0] - box2[2] / 2; + T b2_x2 = box2[0] + box2[2] / 2; + T b2_y1 = box2[1] - box2[3] / 2; + T b2_y2 = box2[1] + box2[3] / 2; + + T b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1); + T b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1); + + T inter_rect_x1 = std::max(b1_x1, b2_x1); + T inter_rect_y1 = std::max(b1_y1, b2_y1); + T inter_rect_x2 = std::min(b1_x2, b2_x2); + T inter_rect_y2 = std::min(b1_y2, b2_y2); + T inter_area = std::max(inter_rect_x2 - inter_rect_x1, static_cast(0.0)) * + std::max(inter_rect_y2 - inter_rect_y1, static_cast(0.0)); + + return inter_area / (b1_area + b2_area - inter_area); +} + +template +static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label, + const float ignore_thresh, std::vector anchors, + const int grid_size, Tensor* obj_mask, + Tensor* noobj_mask, Tensor* tx, Tensor* ty, + Tensor* tw, Tensor* th, Tensor* tconf, + Tensor* tclass) { + const int n = gt_box.dims()[0]; + const int b = gt_box.dims()[1]; + const int anchor_num = anchors.size() / 2; + auto gt_box_t = EigenTensor::From(gt_box); + auto gt_label_t = EigenTensor::From(gt_label); + auto obj_mask_t = EigenTensor::From(*obj_mask).setConstant(0); + auto noobj_mask_t = EigenTensor::From(*noobj_mask).setConstant(1); + auto tx_t = EigenTensor::From(*tx).setConstant(0.0); + auto ty_t = EigenTensor::From(*ty).setConstant(0.0); + auto tw_t = EigenTensor::From(*tw).setConstant(0.0); + auto th_t = EigenTensor::From(*th).setConstant(0.0); + auto tconf_t = EigenTensor::From(*tconf).setConstant(0.0); + auto tclass_t = EigenTensor::From(*tclass).setConstant(0.0); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < b; j++) { + if (isZero(gt_box_t(i, j, 0)) && isZero(gt_box_t(i, j, 1)) && + isZero(gt_box_t(i, j, 2)) && isZero(gt_box_t(i, j, 3))) { + continue; + } + + int cur_label = gt_label_t(i, j); + T gx = gt_box_t(i, j, 0) * grid_size; + T gy = gt_box_t(i, j, 1) * grid_size; + T gw = gt_box_t(i, j, 2) * grid_size; + T gh = gt_box_t(i, j, 3) * grid_size; + int gi = static_cast(gx); + int gj = static_cast(gy); + + T max_iou = static_cast(0); + T iou; + int best_an_index = -1; + std::vector gt_box_shape({0, 0, gw, gh}); + for (int an_idx = 0; an_idx < anchor_num; an_idx++) { + std::vector anchor_shape({0, 0, static_cast(anchors[2 * an_idx]), + static_cast(anchors[2 * an_idx + 1])}); + iou = CalcBoxIoU(gt_box_shape, anchor_shape); + if (iou > max_iou) { + max_iou = iou; + best_an_index = an_idx; + } + if (iou > ignore_thresh) { + noobj_mask_t(i, an_idx, gj, gi) = 0; + } + } + obj_mask_t(i, best_an_index, gj, gi) = 1; + noobj_mask_t(i, best_an_index, gj, gi) = 0; + tx_t(i, best_an_index, gj, gi) = gx - gi; + ty_t(i, best_an_index, gj, gi) = gy - gj; + tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]); + th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]); + tclass_t(i, best_an_index, gj, gi, cur_label) = 1; + tconf_t(i, best_an_index, gj, gi) = 1; + } + } +} + 
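A worked instance of the target assignment performed by PreProcessGTBox above (numbers are illustrative; the anchors are compared directly against gw and gh, so they are expected in grid-cell units, and the IoU only compares shapes since both boxes are centered at the origin):

    grid_size = 13, ground-truth box (x, y, w, h) = (0.5, 0.5, 0.3, 0.2)   normalized
    gx = 6.5, gy = 6.5, gw = 3.9, gh = 2.6      ->  cell (gi, gj) = (6, 6)
    tx = gx - gi = 0.5,   ty = gy - gj = 0.5
    best anchor (pw, ph) = (3.0, 2.0)           ->  tw = log(3.9 / 3.0) ~ 0.26
                                                    th = log(2.6 / 2.0) ~ 0.26
    obj_mask(best, 6, 6) = 1,  tconf = 1,  tclass(label) = 1
    every anchor whose shape IoU with (0, 0, gw, gh) exceeds ignore_thresh
    additionally gets noobj_mask(an, 6, 6) = 0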
+static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand, + const Tensor& obj_mask) { + const int n = obj_mask_expand->dims()[0]; + const int an_num = obj_mask_expand->dims()[1]; + const int h = obj_mask_expand->dims()[2]; + const int w = obj_mask_expand->dims()[3]; + const int class_num = obj_mask_expand->dims()[4]; + auto obj_mask_expand_t = EigenTensor::From(*obj_mask_expand); + auto obj_mask_t = EigenTensor::From(obj_mask); + + obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1)) + .broadcast(Array5(1, 1, 1, 1, class_num)); +} + +template +static void AddAllGradToInputGrad( + Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y, + const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x, + const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h, + const Tensor& grad_conf_target, const Tensor& grad_conf_notarget, + const Tensor& grad_class, const int class_num, const float loss_weight_xy, + const float loss_weight_wh, const float loss_weight_conf_target, + const float loss_weight_conf_notarget, const float loss_weight_class) { + const int n = pred_x.dims()[0]; + const int an_num = pred_x.dims()[1]; + const int h = pred_x.dims()[2]; + const int w = pred_x.dims()[3]; + const int attr_num = class_num + 5; + auto grad_t = EigenTensor::From(*grad).setConstant(0.0); + auto pred_x_t = EigenTensor::From(pred_x); + auto pred_y_t = EigenTensor::From(pred_y); + auto pred_conf_t = EigenTensor::From(pred_conf); + auto pred_class_t = EigenTensor::From(pred_class); + auto grad_x_t = EigenTensor::From(grad_x); + auto grad_y_t = EigenTensor::From(grad_y); + auto grad_w_t = EigenTensor::From(grad_w); + auto grad_h_t = EigenTensor::From(grad_h); + auto grad_conf_target_t = EigenTensor::From(grad_conf_target); + auto grad_conf_notarget_t = EigenTensor::From(grad_conf_notarget); + auto grad_class_t = EigenTensor::From(grad_class); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + grad_t(i, j * attr_num, k, l) = + grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) * + (1.0 - pred_x_t(i, j, k, l)) * loss * loss_weight_xy; + grad_t(i, j * attr_num + 1, k, l) = + grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) * + (1.0 - pred_y_t(i, j, k, l)) * loss * loss_weight_xy; + grad_t(i, j * attr_num + 2, k, l) = + grad_w_t(i, j, k, l) * loss * loss_weight_wh; + grad_t(i, j * attr_num + 3, k, l) = + grad_h_t(i, j, k, l) * loss * loss_weight_wh; + grad_t(i, j * attr_num + 4, k, l) = + grad_conf_target_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss * loss_weight_conf_target; + grad_t(i, j * attr_num + 4, k, l) += + grad_conf_notarget_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss * + loss_weight_conf_notarget; + + for (int c = 0; c < class_num; c++) { + grad_t(i, j * attr_num + 5 + c, k, l) = + grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) * + (1.0 - pred_class_t(i, j, k, l, c)) * loss * loss_weight_class; + } + } + } + } + } +} + +template +class Yolov3LossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); + auto* loss = ctx.Output("Loss"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + float loss_weight_xy = ctx.Attr("loss_weight_xy"); + float 
loss_weight_wh = ctx.Attr("loss_weight_wh"); + float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); + float loss_weight_conf_notarget = + ctx.Attr("loss_weight_conf_notarget"); + float loss_weight_class = ctx.Attr("loss_weight_class"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + + Tensor pred_x, pred_y, pred_w, pred_h; + Tensor pred_conf, pred_class; + pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, + &pred_w, &pred_h, an_num, class_num); + + Tensor obj_mask, noobj_mask; + Tensor tx, ty, tw, th, tconf, tclass; + obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, + &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); + + Tensor obj_mask_expand; + obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, + ctx.GetPlace()); + ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); + + T loss_x = CalcMSEWithMask(pred_x, tx, obj_mask); + T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); + T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); + T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); + T loss_conf_target = CalcBCEWithMask(pred_conf, tconf, obj_mask); + T loss_conf_notarget = CalcBCEWithMask(pred_conf, tconf, noobj_mask); + T loss_class = CalcBCEWithMask(pred_class, tclass, obj_mask_expand); + + auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); + loss_data[0] = loss_weight_xy * (loss_x + loss_y) + + loss_weight_wh * (loss_w + loss_h) + + loss_weight_conf_target * loss_conf_target + + loss_weight_conf_notarget * loss_conf_notarget + + loss_weight_class * loss_class; + } +}; + +template +class Yolov3LossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Loss")); + const T loss = output_grad->data()[0]; + float loss_weight_xy = ctx.Attr("loss_weight_xy"); + float loss_weight_wh = ctx.Attr("loss_weight_wh"); + float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); + float loss_weight_conf_notarget = + ctx.Attr("loss_weight_conf_notarget"); + float loss_weight_class = ctx.Attr("loss_weight_class"); + + const int n = input->dims()[0]; + const int c = input->dims()[1]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + + 
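// Illustrative note (not from the patch itself): the backward kernel below
// mirrors the forward pass above. It recomputes the sigmoid-activated
// predictions and the targets, builds one gradient tensor per branch, and
// AddAllGradToInputGrad then scatters them back into the fused input layout
// [N, an_num * (5 + class_num), H, W]. Every channel that went through a
// sigmoid (x, y, objectness, class scores) picks up the derivative
// s * (1 - s); roughly, for the x branch of anchor j at cell (k, l):
//
//   dLoss/dlogit_x(i, j * attr_num, k, l) =
//       grad_x(i, j, k, l) * pred_x * (1 - pred_x) * loss * loss_weight_xy
//
// so with class_num = 80 (attr_num = 85) the x-logit gradient of anchor 1
// lands in channel 85, its y-logit gradient in channel 86, and so on.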
Tensor pred_x, pred_y, pred_w, pred_h; + Tensor pred_conf, pred_class; + pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, + &pred_w, &pred_h, an_num, class_num); + + Tensor obj_mask, noobj_mask; + Tensor tx, ty, tw, th, tconf, tclass; + obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, + &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); + + Tensor obj_mask_expand; + obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, + ctx.GetPlace()); + ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); + + Tensor grad_x, grad_y, grad_w, grad_h; + Tensor grad_conf_target, grad_conf_notarget, grad_class; + grad_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_target.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_notarget.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + T obj_mf = CalcMaskPointNum(obj_mask); + T noobj_mf = CalcMaskPointNum(noobj_mask); + T obj_expand_mf = CalcMaskPointNum(obj_mask_expand); + CalcMSEGradWithMask(&grad_x, pred_x, tx, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_y, pred_y, ty, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_w, pred_w, tw, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_h, pred_h, th, obj_mask, obj_mf); + CalcBCEGradWithMask(&grad_conf_target, pred_conf, tconf, obj_mask, + obj_mf); + CalcBCEGradWithMask(&grad_conf_notarget, pred_conf, tconf, noobj_mask, + noobj_mf); + CalcBCEGradWithMask(&grad_class, pred_class, tclass, obj_mask_expand, + obj_expand_mf); + + input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); + AddAllGradToInputGrad( + input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y, + grad_w, grad_h, grad_conf_target, grad_conf_notarget, grad_class, + class_num, loss_weight_xy, loss_weight_wh, loss_weight_conf_target, + loss_weight_conf_notarget, loss_weight_class); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h index 2ce9b31bb8..2e8fa7c1b8 100644 --- a/paddle/fluid/platform/assert.h +++ b/paddle/fluid/platform/assert.h @@ -36,6 +36,15 @@ limitations under the License. 
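The assert.h hunk that continues below adds PADDLE_ASSERT_MSG_CODE, a device-side assert that prints a message together with an integer code before trapping. A hypothetical call site inside a CUDA kernel (the condition and the names used here are made up for illustration):

// idx is an int row index; report which value broke the precondition,
// not just that one did.
PADDLE_ASSERT_MSG_CODE(idx >= 0 && idx < row_number,
                       "embedding id out of range", idx);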
*/ asm("trap;"); \ } \ } while (0) + +#define PADDLE_ASSERT_MSG_CODE(e, m, c) \ + do { \ + if (!(e)) { \ + printf("%s:%d Assertion `%s` failed (%s %d).\n", __FILE__, __LINE__, \ + TOSTRING(e), m, c); \ + asm("trap;"); \ + } \ + } while (0) #else #include // For cuda, the assertions can affect performance and it is therefore @@ -43,4 +52,5 @@ limitations under the License. */ // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion #define PADDLE_ASSERT(e) assert((e)) #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m)) +#define PADDLE_ASSERT_MSG_CODE(e, m, c) assert((e) && (m) && (c || 1)) #endif diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 682b0c0ff3..61a25064d1 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -62,7 +62,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { #define CUDNN_ENFORCE(condition) \ do { \ - cudnnStatus_t status = condition; \ + auto status = condition; \ if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) { \ PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \ } \ diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 86f18b7a80..0a4563ead6 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -203,7 +203,7 @@ class DeviceTracerImpl : public DeviceTracer { void AddCPURecords(const std::string &anno, uint64_t start_ns, uint64_t end_ns, int64_t device_id, int64_t thread_id) { if (anno.empty()) { - VLOG(10) << "Empty timeline annotation."; + VLOG(1) << "Empty timeline annotation."; return; } std::lock_guard l(trace_mu_); @@ -216,7 +216,7 @@ class DeviceTracerImpl : public DeviceTracer { uint32_t correlation_id, uint64_t bytes) { // 0 means timestamp information could not be collected for the kernel. if (start_ns == 0 || end_ns == 0) { - VLOG(30) << name << " cannot be traced"; + VLOG(3) << name << " cannot be traced"; return; } std::lock_guard l(trace_mu_); @@ -229,7 +229,7 @@ class DeviceTracerImpl : public DeviceTracer { uint32_t correlation_id) { // 0 means timestamp information could not be collected for the kernel. if (start == 0 || end == 0) { - VLOG(30) << correlation_id << " cannot be traced"; + VLOG(3) << correlation_id << " cannot be traced"; return; } std::lock_guard l(trace_mu_); @@ -348,7 +348,7 @@ class DeviceTracerImpl : public DeviceTracer { tracer->AddAnnotation(cbInfo->correlationId, anno); } } else { - VLOG(10) << "Unhandled API Callback for " << domain << " " << cbid; + VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid; } } CUpti_SubscriberHandle subscriber_; diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 1a83ac7780..213cd8a9ce 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -48,13 +48,13 @@ extern void EnforceCUDNNLoaded(const char* fn_name); #else -#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline cudnnStatus_t operator()(Args... args) { \ - return ::__name(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... 
args) { \ + return ::__name(args...); \ + } \ + }; \ extern DynLoad__##__name __name #endif @@ -111,7 +111,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnFindConvolutionForwardAlgorithmEx); \ __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ - __macro(cudnnGetErrorString); + __macro(cudnnGetErrorString); \ + __macro(cudnnCreateDropoutDescriptor); \ + __macro(cudnnDropoutGetStatesSize); \ + __macro(cudnnSetDropoutDescriptor); \ + __macro(cudnnCreateRNNDescriptor); \ + __macro(cudnnSetRNNDescriptor); \ + __macro(cudnnGetRNNParamsSize); \ + __macro(cudnnGetRNNWorkspaceSize); \ + __macro(cudnnGetRNNTrainingReserveSize); \ + __macro(cudnnRNNForwardTraining); \ + __macro(cudnnRNNBackwardData); \ + __macro(cudnnRNNBackwardWeights); \ + __macro(cudnnRNNForwardInference); \ + __macro(cudnnDestroyDropoutDescriptor); \ + __macro(cudnnDestroyRNNDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); + CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index d53907b749..cc5cda6106 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -72,8 +72,8 @@ static inline std::string join(const std::string& part1, static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, int dynload_flags) { - VLOG(30) << "Try to find library: " << dso_path - << " from default system path."; + VLOG(3) << "Try to find library: " << dso_path + << " from default system path."; // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH // and /usr/local/lib path void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 9273e9b1e7..f0a9736623 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -68,6 +68,8 @@ extern void* mklml_dso_handle; __macro(cblas_dgemm_batch); \ __macro(cblas_sdot); \ __macro(cblas_ddot); \ + __macro(cblas_sasum); \ + __macro(cblas_dasum); \ __macro(cblas_sscal); \ __macro(cblas_dscal); \ __macro(vsAdd); \ diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index ee16fc66e4..9d48557caf 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -1039,6 +1039,11 @@ HOSTDEVICE inline float16 exp(const float16& a) { return float16(::expf(static_cast(a))); } +template <> +HOSTDEVICE inline float16 erf(const float16& a) { + return float16(::erff(static_cast(a))); +} + template <> HOSTDEVICE inline float16 log(const float16& a) { return float16(::logf(static_cast(a))); diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 833d48347f..6954e4c6a9 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -19,7 +19,16 @@ limitations under the License. 
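The mklml.h hunk above registers cblas_sasum and cblas_dasum with the dynamic-load wrapper, so an absolute-value sum can be issued through the same lazily loaded MKLML entry points as the existing BLAS calls. Roughly how such a call would look from operator code (this fragment assumes the usual paddle::platform::dynload wrapper namespace and is not part of the patch):

// asum = sum_i |x_data[i]| over n contiguous floats (stride 1).
float asum = paddle::platform::dynload::cblas_sasum(n, x_data, 1);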
*/ #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" -DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, +#ifndef _WIN32 +constexpr static float fraction_of_gpu_memory_to_use = 0.92f; +#else +// fraction_of_gpu_memory_to_use cannot be too high on windows, +// since the win32 graphic sub-system can occupy some GPU memory +// which may lead to insufficient memory left for paddle +constexpr static float fraction_of_gpu_memory_to_use = 0.5f; +#endif + +DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, "Allocate a trunk of gpu memory that is this fraction of the " "total gpu memory size. Future memory usage will be allocated " "from the trunk. If the trunk doesn't have enough gpu memory, " @@ -144,8 +153,8 @@ size_t GpuMaxChunkSize() { size_t available = 0; GpuMemoryUsage(&available, &total); - VLOG(100) << "GPU Usage " << available / 1024 / 1024 << "M/" - << total / 1024 / 1024 << "M"; + VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" + << total / 1024 / 1024 << "M"; size_t reserving = static_cast(0.05 * total); // If available less than minimum chunk size, no usable memory exists. available = diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 0ccef6c6a8..258779ba51 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -49,7 +49,7 @@ void InitGflags(std::vector argv) { line += ' '; } google::ParseCommandLineFlags(&argc, &arr, true); - VLOG(10) << "Init commandline: " << line; + VLOG(1) << "Init commandline: " << line; }); } diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 814012e6c1..167bd4e81d 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -14,6 +14,7 @@ limitations under the License. 
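The gpu_info.cc change above only adjusts the compiled-in default (0.92 on Linux, 0.5 on Windows); fraction_of_gpu_memory_to_use remains an ordinary gflags flag, so it can still be lowered when other processes already hold GPU memory. A small sketch of overriding it from C++ before the allocator initializes (only the flag name comes from the code above, the rest is illustrative):

#include "gflags/gflags.h"

DECLARE_double(fraction_of_gpu_memory_to_use);

void UseSmallerGpuChunk() {
  // Reserve only 35% of total GPU memory for the initial chunk.
  FLAGS_fraction_of_gpu_memory_to_use = 0.35;
}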
*/ #pragma once #include +#include #include #include #include "paddle/fluid/framework/operator.h" @@ -106,170 +107,6 @@ inline mkldnn::memory::format GetMKLDNNFormat( memory.dst_primitive_desc().desc().data.format); } -class MKLDNNHandler { - public: - MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : dev_ctx_(dev_ctx), - engine_(engine), - key_(base_key), - is_reusing_(false) {} - - std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_src_mem_p"); - } - - std::shared_ptr AcquireWeightsMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_weights_mem_p"); - } - - std::shared_ptr AcquireBiasMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_bias_mem_p"); - } - - std::shared_ptr AcquireDstMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); - } - - std::shared_ptr AcquireDiffSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::primitive_desc mdp, void* ptr, - const std::string& suffix) { - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - "Fail to find mem primitive in device context"); - if (mem_p == nullptr) { - mem_p = std::make_shared(mdp, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. So we check consistency - is_reusing_ = true; - } - return mem_p; - } - - std::shared_ptr AcquireMemory(const mkldnn::memory::desc& md, - void* ptr, - const std::string& suffix) { - /*Generate key*/ - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - "Fail to find mem primitive in device context"); - if (mem_p == nullptr) { - mem_p = std::make_shared( - mkldnn::memory::primitive_desc{md, engine_}, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. 
So we check consistency - is_reusing_ = true; - } - return mem_p; - } - - std::shared_ptr AcquireMemory( - const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p, - const std::string& suffix, - std::vector& pipeline) { // NOLINT - auto local_key = key_ + suffix; - auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto stored_reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - - if (stored_reorder_p) { - pipeline.push_back(*stored_reorder_p); - } else { - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - pipeline.push_back(*reorder_p); - } - - return target_memory_p; - } - - std::shared_ptr AcquireMemory( - mkldnn::memory::primitive_desc& mpd, // NOLINT - mkldnn::memory::primitive_desc& user_mpd, // NOLINT - const std::shared_ptr user_memory_p, - const std::string& suffix, - std::vector& pipeline, // NOLINT - bool is_persistent = false) { - // create reorder primitive if the input format is not the preferred one - auto local_key = key_ + suffix; - auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto target_memory_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false), - "Fail to find mem primitive in device context"); - if (target_memory_p == nullptr) { - target_memory_p = user_memory_p; - std::shared_ptr reorder_p; - if (mpd != user_mpd) { - target_memory_p = std::make_shared(mpd); - - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - pipeline.push_back(*reorder_p); - } - dev_ctx_.SetBlob(local_key, target_memory_p); - } else if (!is_persistent) { - // Make reorder if needed - auto reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - if (reorder_p != nullptr) { - pipeline.push_back(*reorder_p); - } - is_reusing_ = true; - } - return target_memory_p; - } - - static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT - const std::string& suffix) { - return dims2str(operand_dims) + suffix; - } - - protected: - static std::string dims2str(const mkldnn::memory::dims& operand_dims) { - std::string dstr = ""; - for (size_t i = 0; i < operand_dims.size(); ++i) { - dstr += std::to_string(operand_dims[i]) + "-"; - } - return dstr; - } - - protected: - const MKLDNNDeviceContext& dev_ctx_; - mkldnn::engine engine_; - std::string key_; - bool is_reusing_; -}; - inline mkldnn::memory::format MKLDNNFormatForSize( size_t dims_size, mkldnn::memory::format data_format) { if (dims_size == 1) { @@ -292,5 +129,21 @@ inline mkldnn::memory::format data_format_to_memory_format( } } +inline mkldnn::memory::format StringToMKLDNNFormat(std::string* format) { + std::transform(format->begin(), format->end(), format->begin(), ::tolower); + + if (!format->compare("nchw")) { + return mkldnn::memory::format::nchw; + } else if (!format->compare("nchw16c")) { + return mkldnn::memory::format::nChw16c; + } else if (!format->compare("nchw8c")) { + return mkldnn::memory::format::nChw8c; + } else if (!format->compare("nhwc")) { + return mkldnn::memory::format::nhwc; + } else { + return mkldnn::memory::format::any; + } +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h new file mode 100644 index 0000000000..1c6421f3fa --- /dev/null +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -0,0 +1,458 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. 
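StringToMKLDNNFormat, added just above, lower-cases the layout string in place and maps it onto an MKL-DNN memory format, falling back to "any" for anything it does not recognize. A short illustration of the intended call pattern (the variable names are made up):

std::string data_format = "NCHW16C";  // e.g. taken from an op attribute
auto fmt = paddle::platform::StringToMKLDNNFormat(&data_format);
// fmt == mkldnn::memory::format::nChw16c, and data_format is now "nchw16c".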
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +using user_function = std::function(const float*)>; + +class MKLDNNHandler { + public: + MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : dev_ctx_(dev_ctx), + engine_(engine), + key_(base_key), + is_reusing_(false) {} + + std::shared_ptr AcquireSrcMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_src_mem_p"); + } + + std::shared_ptr AcquireWeightsMemory( + const mkldnn::memory::desc& md, void* ptr, + user_function custom_func = {}) { + return this->AcquireMemory(md, ptr, "@user_weights_mem_p", custom_func); + } + + std::shared_ptr AcquireBiasMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_bias_mem_p"); + } + + std::shared_ptr AcquireDstMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); + } + + std::shared_ptr AcquireDiffDstMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); + } + + std::shared_ptr AcquireDiffSrcMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::primitive_desc mdp, void* ptr, + const std::string& suffix) { + auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared(mdp, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + // This incarnation of AcquireMemory can call user function eg. 
custom reorder + // or preprocessing routine if needed + std::shared_ptr AcquireMemory( + const mkldnn::memory::desc& md, void* ptr, const std::string& suffix, + user_function custom_func = {}) { + /*Generate key*/ + auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (mem_p == nullptr) { + // Call custom reorder/preprocessing func if available + if (custom_func) { + auto reordered_data = custom_func(reinterpret_cast(ptr)); + dev_ctx_.SetBlob(local_key + "-custom_reorder", reordered_data); + ptr = reinterpret_cast(reordered_data.get()); + } + + mem_p = std::make_shared( + mkldnn::memory::primitive_desc{md, engine_}, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + std::shared_ptr AcquireMemory( + const std::shared_ptr& user_memory_p, + const std::shared_ptr& target_memory_p, + const std::string& suffix, + std::vector& pipeline) { // NOLINT + auto local_key = key_ + suffix; + auto key_reorder_p = key_ + suffix + "reorder_p"; + + auto stored_reorder_p = std::static_pointer_cast( + dev_ctx_.GetBlob(key_reorder_p)); + + if (stored_reorder_p) { + pipeline.push_back(*stored_reorder_p); + } else { + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + dev_ctx_.SetBlob(key_reorder_p, reorder_p); + pipeline.push_back(*reorder_p); + } + + return target_memory_p; + } + + std::shared_ptr AcquireMemory( + mkldnn::memory::primitive_desc& mpd, // NOLINT + mkldnn::memory::primitive_desc& user_mpd, // NOLINT + const std::shared_ptr user_memory_p, + const std::string& suffix, + std::vector& pipeline, // NOLINT + bool is_persistent = false) { + // create reorder primitive if the input format is not the preferred one + auto local_key = key_ + suffix; + auto key_reorder_p = key_ + suffix + "reorder_p"; + + auto target_memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (target_memory_p == nullptr) { + target_memory_p = user_memory_p; + std::shared_ptr reorder_p; + if (mpd != user_mpd) { + target_memory_p = std::make_shared(mpd); + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + dev_ctx_.SetBlob(key_reorder_p, reorder_p); + pipeline.push_back(*reorder_p); + } + dev_ctx_.SetBlob(local_key, target_memory_p); + } else if (!is_persistent) { + // Make reorder if needed + auto reorder_p = std::static_pointer_cast( + dev_ctx_.GetBlob(key_reorder_p)); + if (reorder_p != nullptr) { + pipeline.push_back(*reorder_p); + } + is_reusing_ = true; + } + return target_memory_p; + } + + static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT + const std::string& suffix) { + return dims2str(operand_dims) + suffix; + } + + protected: + static std::string dims2str(const mkldnn::memory::dims& operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + } + + protected: + const MKLDNNDeviceContext& dev_ctx_; + mkldnn::engine engine_; + std::string key_; + bool is_reusing_; +}; + +template +class ConvMKLDNNTemplateHandler : public MKLDNNHandler 
{ + public: + ConvMKLDNNTemplateHandler( + std::shared_ptr conv_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key) { + conv_pd_ = conv_pd; + } + + ConvMKLDNNTemplateHandler( + std::shared_ptr conv_pd, + std::shared_ptr + conv_bwd_data_pd, + std::shared_ptr + conv_bwd_weights_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + conv_pd_(conv_pd), + conv_bwd_weights_pd_(conv_bwd_weights_pd), + conv_bwd_data_pd_(conv_bwd_data_pd) { + // If we are in Grad operatgor then update a key with BWD suffix to + // distinguish from FWD memory primitives + key_ += "-BWD"; + } + + size_t GetDstMemorySize() const { + return conv_pd_->dst_primitive_desc().get_size(); + } + + mkldnn::memory::format GetDstFormat() const { + return static_cast( + conv_pd_->dst_primitive_desc().desc().data.format); + } + + size_t GetDiffWeightsMemorySize() const { + return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size(); + } + + size_t GetDiffSourceMemorySize() const { + return conv_bwd_data_pd_->diff_src_primitive_desc().get_size(); + } + + std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { // NOLINT + auto src_pd = conv_bwd_weights_pd_->src_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(src_pd, user_pd, user_memory_p, + "@weights-src_mem_p", pipeline); + } + + std::shared_ptr AcquireDiffDstMemoryFromWeightsPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { // NOLINT + auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, + "@weights-diff_dst_mem_p", pipeline); + } + + std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( + void* ptr) { + return this->AcquireMemoryFromPrimitive( + conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr, + "@diff_weights_mem_p"); + } + + std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { // NOLINT + auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, + "@data-diff_dst_mem_p", pipeline); + } + + std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( + const std::shared_ptr user_weights_memory_p, + std::vector& pipeline) { // NOLINT + auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc(); + auto user_pd = user_weights_memory_p->get_primitive_desc(); + return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, + "@data-weights_mem_p", pipeline); + } + + std::shared_ptr AcquireResidualDataMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p"); + } + + std::shared_ptr AcquireDstMemoryFromResidualDataMemory( + const std::shared_ptr& user_residual_memory_p, + void* dst_ptr, + std::vector& pipeline) { // NOLINT + return this->AcquireMemory(user_residual_memory_p, + this->AcquireDstMemoryFromPrimitive(dst_ptr), + "@residual_data_mem_p", pipeline); + } + + std::shared_ptr AcquireDiffSrcMemoryFromDataPrimitive( + void* ptr) { + return this->AcquireMemoryFromPrimitive( + 
conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p"); + } + + std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { + return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr, + "@dst_mem_p"); + } + + std::shared_ptr AcquireSrcMemoryFromPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { // NOLINT + auto src_pd = conv_pd_->src_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", + pipeline); + } + + std::shared_ptr AcquireWeightsMemoryFromPrimitive( + const std::shared_ptr user_weights_memory_p, + std::vector& pipeline, // NOLINT + bool is_persistent = false) { + auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); + auto weights_pd = conv_pd_->weights_primitive_desc(); + return this->AcquireMemory(weights_pd, user_weights_pd, + user_weights_memory_p, "@weights_mem_p", + pipeline, is_persistent); + } + + std::shared_ptr AcquireBiasMemoryFromPrimitive( + const std::shared_ptr user_bias_memory_p, + std::vector& pipeline) { // NOLINT + auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); + auto bias_pd = conv_pd_->bias_primitive_desc(); + return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, + "@bias_mem_p", pipeline); + } + + std::shared_ptr AcquireConvolution( + std::shared_ptr src_memory_p, + std::shared_ptr weights_memory_p, + std::shared_ptr dst_memory_p) { + auto prim_key = key_ + "@conv_p"; + auto conv_p = + std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution primitive in device context"); + if (conv_p == nullptr) { + conv_p = std::make_shared(*conv_pd_, *(src_memory_p), + *(weights_memory_p.get()), + *(dst_memory_p.get())); + + dev_ctx_.SetBlob(prim_key, conv_p); + } else { + is_reusing_ = true; + } + return conv_p; + } + + std::shared_ptr AcquireConvolution( + std::shared_ptr src_memory_p, + std::shared_ptr weights_memory_p, + std::shared_ptr bias_memory_p, + std::shared_ptr dst_memory_p) { + auto prim_key = key_ + "@conv_p"; + auto conv_p = + std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution primitive in device context"); + if (conv_p == nullptr) { + conv_p = std::make_shared( + *conv_pd_, *(src_memory_p), *(weights_memory_p.get()), + *(bias_memory_p.get()), *(dst_memory_p.get())); + + dev_ctx_.SetBlob(prim_key, conv_p); + } else { + is_reusing_ = true; + } + return conv_p; + } + + std::shared_ptr AcquireConvolutionBackwardWeights( + std::shared_ptr src_memory_p, + std::shared_ptr diff_dst_memory_p, + std::shared_ptr diff_weights_memory_p) { + auto prim_key = key_ + "@conv_bwd_weights_p"; + auto conv_bwd_weights_p = std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE( + (conv_bwd_weights_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution bwd weights primitive in device context"); + if (conv_bwd_weights_p == nullptr) { + // create backward conv primitive for weights + conv_bwd_weights_p = std::make_shared( + *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p, + *diff_weights_memory_p); + dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p); + } else { + is_reusing_ = true; + } + return conv_bwd_weights_p; + } + + std::shared_ptr AcquireConvolutionBackwardData( + std::shared_ptr diff_dst_memory_p, + std::shared_ptr weights_memory_p, + std::shared_ptr 
diff_src_memory_p) { + auto prim_key = key_ + "@conv_bwd_data_p"; + auto conv_bwd_data_p = + std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE( + (conv_bwd_data_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution bwd data primitive in device context"); + if (conv_bwd_data_p == nullptr) { + conv_bwd_data_p = std::make_shared( + *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p, + *diff_src_memory_p); + dev_ctx_.SetBlob(prim_key, conv_bwd_data_p); + } else { + is_reusing_ = true; + } + return conv_bwd_data_p; + } + + // Generate keys for storing/retriving primitives for this operator + // TODO(jczaja): Make hashing function more optimial + static std::string GetHash(mkldnn::memory::dims& input_dims, // NOLINT + mkldnn::memory::dims& weights_dims, // NOLINT + std::vector& strides, // NOLINT + std::vector& paddings, // NOLINT + std::vector& dilations, // NOLINT + int groups, const std::string& suffix) { + return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) + + dims2str(paddings) + dims2str(dilations) + std::to_string(groups) + + suffix; + } + + private: + std::shared_ptr conv_pd_; + std::shared_ptr + conv_bwd_weights_pd_; + std::shared_ptr conv_bwd_data_pd_; +}; + +using ConvMKLDNNHandler = + ConvMKLDNNTemplateHandler; + +using ConvTransposeMKLDNNHandler = + ConvMKLDNNTemplateHandler; +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index a6360a884d..fc903b548c 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -113,7 +113,7 @@ struct NCCLContextMap { NCCLGroupGuard gurad; for (auto &gpu_id : order_) { int rank = trainer_id * order_.size() + gpu_id; - VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks; + VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks; PADDLE_ENFORCE(cudaSetDevice(gpu_id)); PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( comms.get() + gpu_id, nranks, *nccl_id, rank)); diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 11c68f3449..ed8734c98c 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -14,11 +14,11 @@ #pragma once +#include #include #include #include #include -#include "ThreadPool.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 25d241d976..d602613fc8 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc new file mode 100644 index 0000000000..470e8b0508 --- /dev/null +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +// To avoid conflicting definition in gcc-4.8.2 headers and pyconfig.h (2.7.3) +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif +#include +#include + +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" +#include "paddle/fluid/framework/async_executor.h" +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/variant.h" +#include "paddle/fluid/pybind/async_executor_py.h" + +namespace py = pybind11; +namespace pd = paddle::framework; + +namespace paddle { +namespace pybind { +using set_name_func = void (pd::DataFeedDesc::*)(const std::string&); +void BindAsyncExecutor(py::module* m) { + py::class_(*m, "AsyncExecutor") + .def(py::init([](framework::Scope* scope, const platform::Place& place) { + return std::unique_ptr( + new framework::AsyncExecutor(scope, place)); + })) + .def("run_from_files", &framework::AsyncExecutor::RunFromFile); +} // end BindAsyncExecutor +} // end namespace pybind +} // end namespace paddle diff --git a/paddle/fluid/pybind/async_executor_py.h b/paddle/fluid/pybind/async_executor_py.h new file mode 100644 index 0000000000..a99d6e0421 --- /dev/null +++ b/paddle/fluid/pybind/async_executor_py.h @@ -0,0 +1,28 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindAsyncExecutor(py::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 586e92c2b3..ac406b27b5 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -29,12 +29,21 @@ limitations under the License. 
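async_executor_py.cc above only defines the pybind11 binding for AsyncExecutor (its constructor plus run_from_files); it still has to be registered from the main extension module, presumably somewhere in pybind.cc along these lines (the module name and surrounding code are illustrative, not taken from this patch):

#include "paddle/fluid/pybind/async_executor_py.h"

PYBIND11_MODULE(core, m) {
  // ... existing bindings ...
  paddle::pybind::BindAsyncExecutor(&m);
}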
*/ namespace pybind11 { namespace detail { +#if !defined(PYBIND11_HIDDEN) +#ifdef _WIN32 +#define PYBIND11_HIDDEN __declspec(dllexport) +#else +#define PYBIND11_HIDDEN __attribute__((visibility("hidden"))) +#endif +#endif + // Can be replaced by a generic lambda in C++14 -struct variant_caster_visitor : public boost::static_visitor { +struct PYBIND11_HIDDEN paddle_variant_caster_visitor + : public boost::static_visitor { return_value_policy policy; handle parent; - variant_caster_visitor(return_value_policy policy, handle parent) + paddle_variant_caster_visitor(return_value_policy policy, handle parent) : policy(policy), parent(parent) {} template @@ -44,10 +53,10 @@ struct variant_caster_visitor : public boost::static_visitor { }; template -struct variant_caster; +struct paddle_variant_caster; template